stevedore-uploader 1.0.12-java → 1.0.13-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +5 -5
- data/bin/{upload_to_elasticsearch.rb → stevedore.rb} +1 -1
- data/lib/stevedore-uploader.rb +2 -2
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bd37dc975bfda43876f38b9c55f258f0f3667704
|
4
|
+
data.tar.gz: 43d822366937414dc33ff300b45461ce8720405e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cf35c3b48af3928e4432d48196d49e7f08ac20acfb5eeef80b58365ee4036bf2936187e6247d948b3471670fc846ffbcee5b7d8d19e179ee2e4948fe252a397a
|
7
|
+
data.tar.gz: fbf1d8c9e26e519e5ac9ef09cb56344f2a6820bd55a7770474e51b2981ac1270362af5342213eea4e880acbf1250b08b1249a35cf0eec5733353ac34425f1e68
|
data/README.md
CHANGED
@@ -22,7 +22,7 @@ This project is in JRuby, so we can leverage the transformative enterprise stabi
|
|
22
22
|
Command-Line Options
|
23
23
|
--------------------
|
24
24
|
````
|
25
|
-
Usage: upload_to_elasticsearch [options] target_(dir_or_csv)
|
25
|
+
Usage: stevedore [options] target_(dir_or_csv)
|
26
26
|
-h, --host=SERVER:PORT The location of the ElasticSearch server
|
27
27
|
-i, --index=NAME A name to use for the ES index (defaults to using the directory name)
|
28
28
|
-s, --s3path=PATH The path under your bucket where these files have been uploaded. (defaults to ES index)
|
@@ -41,23 +41,23 @@ Advanced Usage
|
|
41
41
|
|
42
42
|
upload documents from your local disk
|
43
43
|
```
|
44
|
-
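bundle exec ruby bin/upload_to_elasticsearch.rb --index=INDEXNAMEx [--host=localhost:9200] [--s3path=name-of-path-under-bucket] path/to/documents/to/parse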
|
44
|
+
bundle exec ruby bin/stevedore.rb --index=INDEXNAMEx [--host=localhost:9200] [--s3path=name-of-path-under-bucket] path/to/documents/to/parse
|
45
45
|
```
|
46
46
|
or from s3
|
47
47
|
```
|
48
|
-
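bundle exec ruby bin/upload_to_elasticsearch.rb --index=INDEXNAMEx [--host=localhost:9200] s3://my-bucket/path/to/documents/to/parse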
|
48
|
+
bundle exec ruby bin/stevedore.rb --index=INDEXNAMEx [--host=localhost:9200] s3://my-bucket/path/to/documents/to/parse
|
49
49
|
```
|
50
50
|
|
51
51
|
if host isn't specified, we assume `localhost:9200`.
|
52
52
|
|
53
53
|
e.g.
|
54
54
|
```
|
55
|
-
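bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --host=https://stevedore.elasticsearch.yourdomain.net/es/ ~/code/marco-rubios-emails/emls/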
|
55
|
+
bundle exec ruby bin/stevedore.rb --index=jrubytest --host=https://stevedore.elasticsearch.yourdomain.net/es/ ~/code/marco-rubios-emails/emls/
|
56
56
|
```
|
57
57
|
|
58
58
|
you may also specify an s3:// location of documents to parse, instead of a local directory, e.g.
|
59
59
|
```
|
60
|
-
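bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --host=https://stevedore.elasticsearch.yourdomain.net/es/ s3://int-data-dumps/marco-rubio-fire-drill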
|
60
|
+
bundle exec ruby bin/stevedore.rb --index=jrubytest --host=https://stevedore.elasticsearch.yourdomain.net/es/ s3://int-data-dumps/marco-rubio-fire-drill
|
61
61
|
```
|
62
62
|
if you choose to process documents from S3, you should upload those documents using your choice of tool -- but `awscli` is a good choice. *Stevedore-Uploader does NOT upload documents to S3 on your behalf.
|
63
63
|
|
data/bin/{upload_to_elasticsearch.rb → stevedore.rb}
CHANGED
@@ -12,7 +12,7 @@ if __FILE__ == $0
|
|
12
12
|
options = OpenStruct.new
|
13
13
|
options.ocr = true
|
14
14
|
|
15
|
-
op = OptionParser.new("Usage: upload_to_elasticsearch [options] target_(dir_or_csv)") do |opts|
|
15
|
+
op = OptionParser.new("Usage: stevedore [options] target_(dir_or_csv)") do |opts|
|
16
16
|
opts.on("-hSERVER:PORT", "--host=SERVER:PORT",
|
17
17
|
"The location of the ElasticSearch server") do |host|
|
18
18
|
options.host = host
|
data/lib/stevedore-uploader.rb
CHANGED
@@ -127,7 +127,7 @@ module Stevedore
|
|
127
127
|
return nil
|
128
128
|
end
|
129
129
|
(Dir["#{pdf_basename}-*.png"] + Dir["#{pdf_basename}.png"]).sort_by{|png| (matchdata = png.match(/-\d+\.png/)).nil? ? 0 : matchdata[0].to_i }.each do |png|
|
130
|
-
ret = system('tesseract', png, png, "pdf", "")
|
130
|
+
ret = system('tesseract', png, png, "pdf", "", "quiet")
|
131
131
|
if ret.nil?
|
132
132
|
STDERR.puts "No tesseract (or not on path); skipping OCR"
|
133
133
|
return nil
|
@@ -187,7 +187,7 @@ module Stevedore
|
|
187
187
|
rescue StandardError, java.lang.NoClassDefFoundError, org.apache.tika.exception.TikaException => e
|
188
188
|
STDERR.puts e.inspect
|
189
189
|
STDERR.puts "#{e} #{e.message}: #{filename}"
|
190
|
-
STDERR.puts e.backtrace.join("\n") + "\n\n\n"
|
190
|
+
STDERR.puts e.backtrace.join("\n") + "\n\n\n" if e.backtrace
|
191
191
|
# puts "\n"
|
192
192
|
@errors << filename
|
193
193
|
nil
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: stevedore-uploader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.12
|
4
|
+
version: 1.0.13
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Jeremy B. Merrill
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-05-
|
11
|
+
date: 2017-05-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -166,12 +166,13 @@ dependencies:
|
|
166
166
|
version: 0.0.8
|
167
167
|
description: TK
|
168
168
|
email: jeremy.merrill@nytimes.com
|
169
|
-
executables: []
|
169
|
+
executables:
|
170
|
+
- stevedore.rb
|
170
171
|
extensions: []
|
171
172
|
extra_rdoc_files: []
|
172
173
|
files:
|
173
174
|
- README.md
|
174
|
-
- bin/upload_to_elasticsearch.rb
|
175
|
+
- bin/stevedore.rb
|
175
176
|
- lib/parsers/stevedore_blob.rb
|
176
177
|
- lib/parsers/stevedore_csv_row.rb
|
177
178
|
- lib/parsers/stevedore_email.rb
|