stevedore-uploader 1.0.12-java → 1.0.13-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +5 -5
- data/bin/{upload_to_elasticsearch.rb → stevedore.rb} +1 -1
- data/lib/stevedore-uploader.rb +2 -2
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bd37dc975bfda43876f38b9c55f258f0f3667704
|
4
|
+
data.tar.gz: 43d822366937414dc33ff300b45461ce8720405e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cf35c3b48af3928e4432d48196d49e7f08ac20acfb5eeef80b58365ee4036bf2936187e6247d948b3471670fc846ffbcee5b7d8d19e179ee2e4948fe252a397a
|
7
|
+
data.tar.gz: fbf1d8c9e26e519e5ac9ef09cb56344f2a6820bd55a7770474e51b2981ac1270362af5342213eea4e880acbf1250b08b1249a35cf0eec5733353ac34425f1e68
|
data/README.md
CHANGED
@@ -22,7 +22,7 @@ This project is in JRuby, so we can leverage the transformative enterprise stabi
|
|
22
22
|
Command-Line Options
|
23
23
|
--------------------
|
24
24
|
````
|
25
|
-
Usage:
|
25
|
+
Usage: stevedore [options] target_(dir_or_csv)
|
26
26
|
-h, --host=SERVER:PORT The location of the ElasticSearch server
|
27
27
|
-i, --index=NAME A name to use for the ES index (defaults to using the directory name)
|
28
28
|
-s, --s3path=PATH The path under your bucket where these files have been uploaded. (defaults to ES index)
|
@@ -41,23 +41,23 @@ Advanced Usage
|
|
41
41
|
|
42
42
|
upload documents from your local disk
|
43
43
|
```
|
44
|
-
bundle exec ruby bin/
|
44
|
+
bundle exec ruby bin/stevedore.rb --index=INDEXNAMEx [--host=localhost:9200] [--s3path=name-of-path-under-bucket] path/to/documents/to/parse
|
45
45
|
```
|
46
46
|
or from s3
|
47
47
|
```
|
48
|
-
bundle exec ruby bin/
|
48
|
+
bundle exec ruby bin/stevedore.rb --index=INDEXNAMEx [--host=localhost:9200] s3://my-bucket/path/to/documents/to/parse
|
49
49
|
```
|
50
50
|
|
51
51
|
if host isn't specified, we assume `localhost:9200`.
|
52
52
|
|
53
53
|
e.g.
|
54
54
|
```
|
55
|
-
bundle exec ruby bin/
|
55
|
+
bundle exec ruby bin/stevedore.rb --index=jrubytest --host=https://stevedore.elasticsearch.yourdomain.net/es/ ~/code/marco-rubios-emails/emls/
|
56
56
|
```
|
57
57
|
|
58
58
|
you may also specify an s3:// location of documents to parse, instead of a local directory, e.g.
|
59
59
|
```
|
60
|
-
bundle exec ruby bin/
|
60
|
+
bundle exec ruby bin/stevedore.rb --index=jrubytest --host=https://stevedore.elasticsearch.yourdomain.net/es/ s3://int-data-dumps/marco-rubio-fire-drill
|
61
61
|
```
|
62
62
|
if you choose to process documents from S3, you should upload those documents using your choice of tool -- but `awscli` is a good choice. *Stevedore-Uploader does NOT upload documents to S3 on your behalf.*
|
63
63
|
|
data/bin/{upload_to_elasticsearch.rb → stevedore.rb}
CHANGED
@@ -12,7 +12,7 @@ if __FILE__ == $0
|
|
12
12
|
options = OpenStruct.new
|
13
13
|
options.ocr = true
|
14
14
|
|
15
|
-
op = OptionParser.new("Usage:
|
15
|
+
op = OptionParser.new("Usage: stevedore [options] target_(dir_or_csv)") do |opts|
|
16
16
|
opts.on("-hSERVER:PORT", "--host=SERVER:PORT",
|
17
17
|
"The location of the ElasticSearch server") do |host|
|
18
18
|
options.host = host
|
data/lib/stevedore-uploader.rb
CHANGED
@@ -127,7 +127,7 @@ module Stevedore
|
|
127
127
|
return nil
|
128
128
|
end
|
129
129
|
(Dir["#{pdf_basename}-*.png"] + Dir["#{pdf_basename}.png"]).sort_by{|png| (matchdata = png.match(/-\d+\.png/)).nil? ? 0 : matchdata[0].to_i }.each do |png|
|
130
|
-
ret = system('tesseract', png, png, "pdf", "")
|
130
|
+
ret = system('tesseract', png, png, "pdf", "", "quiet")
|
131
131
|
if ret.nil?
|
132
132
|
STDERR.puts "No tesseract (or not on path); skipping OCR"
|
133
133
|
return nil
|
@@ -187,7 +187,7 @@ module Stevedore
|
|
187
187
|
rescue StandardError, java.lang.NoClassDefFoundError, org.apache.tika.exception.TikaException => e
|
188
188
|
STDERR.puts e.inspect
|
189
189
|
STDERR.puts "#{e} #{e.message}: #{filename}"
|
190
|
-
STDERR.puts e.backtrace.join("\n") + "\n\n\n"
|
190
|
+
STDERR.puts e.backtrace.join("\n") + "\n\n\n" if e.backtrace
|
191
191
|
# puts "\n"
|
192
192
|
@errors << filename
|
193
193
|
nil
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: stevedore-uploader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.12
|
4
|
+
version: 1.0.13
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Jeremy B. Merrill
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-05-
|
11
|
+
date: 2017-05-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -166,12 +166,13 @@ dependencies:
|
|
166
166
|
version: 0.0.8
|
167
167
|
description: TK
|
168
168
|
email: jeremy.merrill@nytimes.com
|
169
|
-
executables:
|
169
|
+
executables:
|
170
|
+
- stevedore.rb
|
170
171
|
extensions: []
|
171
172
|
extra_rdoc_files: []
|
172
173
|
files:
|
173
174
|
- README.md
|
174
|
-
- bin/upload_to_elasticsearch.rb
|
175
|
+
- bin/stevedore.rb
|
175
176
|
- lib/parsers/stevedore_blob.rb
|
176
177
|
- lib/parsers/stevedore_csv_row.rb
|
177
178
|
- lib/parsers/stevedore_email.rb
|