stevedore-uploader 1.0.14-java → 1.0.15-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/stevedore +71 -79
- data/lib/stevedore-uploader.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 592883a568d09ec2fa6a6155ac06409f96b01386
|
4
|
+
data.tar.gz: 6aaa836860bcdd8cbc432d92ade4bb61616f9a67
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6993d8f2bf553ec22560f50e95bd5f1415aace8f11c43dd335ae80cf5b955c7fb50afe7b66f98055375f1a49e8442add0a9f65b8e2013ef43bcb5c319aeded74
|
7
|
+
data.tar.gz: 0dde17b5fe9791d24d5342448911f704ad93fedcaf715d040ebb0a323ea84d369f7f91389cc7be6472a3740583936d5b6ad0b3554b4a97c997589b8406c22fa7
|
data/bin/stevedore
CHANGED
@@ -6,69 +6,67 @@ raise Exception, "You've gotta use Java 1.8; you're on #{java.lang.System.getPro
|
|
6
6
|
|
7
7
|
require "#{File.expand_path(File.dirname(__FILE__))}/../lib/stevedore-uploader.rb"
|
8
8
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
options.host = host
|
19
|
-
end
|
20
|
-
|
21
|
-
opts.on("-iNAME", "--index=NAME",
|
22
|
-
"A name to use for the ES index (defaults to using the directory name)") do |index|
|
23
|
-
options.index = index
|
24
|
-
end
|
25
|
-
|
26
|
-
opts.on("-sPATH", "--s3path=PATH",
|
27
|
-
"The path under your bucket where these files have been uploaded. (defaults to ES index)"
|
28
|
-
) do |s3path|
|
29
|
-
options.s3path = s3path
|
30
|
-
end
|
31
|
-
opts.on("-bPATH", "--s3bucket=PATH",
|
32
|
-
"The s3 bucket where these files have already been be uploaded (or will be later)."
|
33
|
-
) do |s3bucket|
|
34
|
-
options.s3bucket = s3bucket
|
35
|
-
end
|
36
|
-
|
37
|
-
opts.on("--title-column=COLNAME",
|
38
|
-
"If target file is a CSV, which column contains the title of the row. Integer index or string column name."
|
39
|
-
) do |title_column|
|
40
|
-
options.title_column = title_column
|
41
|
-
end
|
42
|
-
opts.on("--text-column=COLNAME",
|
43
|
-
"If target file is a CSV, which column contains the main, searchable of the row. Integer index or string column name."
|
44
|
-
) do |text_column|
|
45
|
-
options.text_column = text_column
|
46
|
-
end
|
47
|
-
opts.on("--slice-size=SLICE",
|
48
|
-
"Process documents in batches of SLICE. Default is 100. Lower this if you get timeouts. Raise it to go faster."
|
49
|
-
) do |slice_size|
|
50
|
-
options.slice_size = slice_size.to_i
|
51
|
-
end
|
52
|
-
|
53
|
-
opts.on("-o", "--[no-]ocr", "don't attempt to OCR any PDFs, even if they contain no text") do |v|
|
54
|
-
options.ocr = v
|
55
|
-
end
|
56
|
-
|
57
|
-
opts.on( '-?', '--help', 'Display this screen' ) do
|
58
|
-
puts opts
|
59
|
-
exit
|
60
|
-
end
|
9
|
+
require 'optparse'
|
10
|
+
require 'ostruct'
|
11
|
+
options = OpenStruct.new
|
12
|
+
options.ocr = true
|
13
|
+
|
14
|
+
op = OptionParser.new("Usage: stevedore [options] target_(dir_or_csv)") do |opts|
|
15
|
+
opts.on("-hSERVER:PORT", "--host=SERVER:PORT",
|
16
|
+
"The location of the ElasticSearch server") do |host|
|
17
|
+
options.host = host
|
61
18
|
end
|
62
19
|
|
63
|
-
|
20
|
+
opts.on("-iNAME", "--index=NAME",
|
21
|
+
"A name to use for the ES index (defaults to using the directory name)") do |index|
|
22
|
+
options.index = index
|
23
|
+
end
|
64
24
|
|
65
|
-
|
66
|
-
|
67
|
-
|
25
|
+
opts.on("-sPATH", "--s3path=PATH",
|
26
|
+
"The path under your bucket where these files have been uploaded. (defaults to ES index)"
|
27
|
+
) do |s3path|
|
28
|
+
options.s3path = s3path
|
29
|
+
end
|
30
|
+
opts.on("-bPATH", "--s3bucket=PATH",
|
31
|
+
"The s3 bucket where these files have already been be uploaded (or will be later)."
|
32
|
+
) do |s3bucket|
|
33
|
+
options.s3bucket = s3bucket
|
34
|
+
end
|
35
|
+
|
36
|
+
opts.on("--title-column=COLNAME",
|
37
|
+
"If target file is a CSV, which column contains the title of the row. Integer index or string column name."
|
38
|
+
) do |title_column|
|
39
|
+
options.title_column = title_column
|
40
|
+
end
|
41
|
+
opts.on("--text-column=COLNAME",
|
42
|
+
"If target file is a CSV, which column contains the main, searchable of the row. Integer index or string column name."
|
43
|
+
) do |text_column|
|
44
|
+
options.text_column = text_column
|
45
|
+
end
|
46
|
+
opts.on("--slice-size=SLICE",
|
47
|
+
"Process documents in batches of SLICE. Default is 100. Lower this if you get timeouts. Raise it to go faster."
|
48
|
+
) do |slice_size|
|
49
|
+
options.slice_size = slice_size.to_i
|
50
|
+
end
|
51
|
+
|
52
|
+
opts.on("-o", "--[no-]ocr", "don't attempt to OCR any PDFs, even if they contain no text") do |v|
|
53
|
+
options.ocr = v
|
54
|
+
end
|
55
|
+
|
56
|
+
opts.on( '-?', '--help', 'Display this screen' ) do
|
57
|
+
puts opts
|
68
58
|
exit
|
69
59
|
end
|
70
60
|
end
|
71
61
|
|
62
|
+
op.parse!
|
63
|
+
|
64
|
+
# to delete an index: curl -X DELETE localhost:9200/indexname/
|
65
|
+
unless ARGV.length == 1
|
66
|
+
puts op
|
67
|
+
exit
|
68
|
+
end
|
69
|
+
|
72
70
|
# you can provide either a path to files locally or
|
73
71
|
# an S3 endpoint as s3://int-data-dumps/YOURINDEXNAME
|
74
72
|
FOLDER = ARGV.shift
|
@@ -93,28 +91,22 @@ S3_BASEPATH = "https://#{S3_BUCKET}.s3.amazonaws.com/#{S3_PATH}"
|
|
93
91
|
raise ArgumentError, "specify a destination" unless FOLDER
|
94
92
|
raise ArgumentError, "specify the elasticsearch host" unless ES_HOST
|
95
93
|
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
f = Stevedore::ESUploader.new(ES_HOST, ES_INDEX, S3_BUCKET, S3_BASEPATH)
|
102
|
-
f.should_ocr = options.ocr
|
103
|
-
puts "Will not OCR, per --no-ocr option" unless f.should_ocr
|
104
|
-
f.slice_size = options.slice_size if options.slice_size
|
105
|
-
puts "Slice size set to #{f.slice_size}" if options.slice_size
|
106
|
-
|
107
|
-
if FOLDER.match(/\.[ct]sv$/)
|
108
|
-
f.do_csv!(FOLDER, File.join(f.s3_basepath, File.basename(FOLDER)), options.title_column, options.text_column)
|
109
|
-
else
|
110
|
-
f.do!(FOLDER)
|
111
|
-
end
|
112
|
-
puts "Finished uploading documents at #{Time.now}"
|
94
|
+
f = Stevedore::ESUploader.new(ES_HOST, ES_INDEX, S3_BUCKET, S3_BASEPATH)
|
95
|
+
f.should_ocr = options.ocr
|
96
|
+
puts "Will not OCR, per --no-ocr option" unless f.should_ocr
|
97
|
+
f.slice_size = options.slice_size if options.slice_size
|
98
|
+
puts "Slice size set to #{f.slice_size}" if options.slice_size
|
113
99
|
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
100
|
+
if FOLDER.match(/\.[ct]sv$/)
|
101
|
+
f.do_csv!(FOLDER, File.join(f.s3_basepath, File.basename(FOLDER)), options.title_column, options.text_column)
|
102
|
+
else
|
103
|
+
f.do!(FOLDER)
|
104
|
+
end
|
105
|
+
puts "Finished uploading documents at #{Time.now}"
|
106
|
+
|
107
|
+
puts "Created Stevedore for #{ES_INDEX}; go check out https://stevedore.newsdev.net/search/#{ES_INDEX} or http://stevedore.adm.prd.newsdev.nytimes.com/search/#{ES_INDEX}"
|
108
|
+
if f.errors.size > 0
|
109
|
+
STDERR.puts "#{f.errors.size} failed documents:"
|
110
|
+
STDERR.puts f.errors.inspect
|
111
|
+
puts "Uploading successful, but with #{f.errors.size} errors."
|
120
112
|
end
|
data/lib/stevedore-uploader.rb
CHANGED