stevedore-uploader 1.0.14-java → 1.0.15-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/stevedore +71 -79
- data/lib/stevedore-uploader.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 592883a568d09ec2fa6a6155ac06409f96b01386
|
4
|
+
data.tar.gz: 6aaa836860bcdd8cbc432d92ade4bb61616f9a67
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6993d8f2bf553ec22560f50e95bd5f1415aace8f11c43dd335ae80cf5b955c7fb50afe7b66f98055375f1a49e8442add0a9f65b8e2013ef43bcb5c319aeded74
|
7
|
+
data.tar.gz: 0dde17b5fe9791d24d5342448911f704ad93fedcaf715d040ebb0a323ea84d369f7f91389cc7be6472a3740583936d5b6ad0b3554b4a97c997589b8406c22fa7
|
data/bin/stevedore
CHANGED
@@ -6,69 +6,67 @@ raise Exception, "You've gotta use Java 1.8; you're on #{java.lang.System.getPro
|
|
6
6
|
|
7
7
|
require "#{File.expand_path(File.dirname(__FILE__))}/../lib/stevedore-uploader.rb"
|
8
8
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
options.host = host
|
19
|
-
end
|
20
|
-
|
21
|
-
opts.on("-iNAME", "--index=NAME",
|
22
|
-
"A name to use for the ES index (defaults to using the directory name)") do |index|
|
23
|
-
options.index = index
|
24
|
-
end
|
25
|
-
|
26
|
-
opts.on("-sPATH", "--s3path=PATH",
|
27
|
-
"The path under your bucket where these files have been uploaded. (defaults to ES index)"
|
28
|
-
) do |s3path|
|
29
|
-
options.s3path = s3path
|
30
|
-
end
|
31
|
-
opts.on("-bPATH", "--s3bucket=PATH",
|
32
|
-
"The s3 bucket where these files have already been be uploaded (or will be later)."
|
33
|
-
) do |s3bucket|
|
34
|
-
options.s3bucket = s3bucket
|
35
|
-
end
|
36
|
-
|
37
|
-
opts.on("--title-column=COLNAME",
|
38
|
-
"If target file is a CSV, which column contains the title of the row. Integer index or string column name."
|
39
|
-
) do |title_column|
|
40
|
-
options.title_column = title_column
|
41
|
-
end
|
42
|
-
opts.on("--text-column=COLNAME",
|
43
|
-
"If target file is a CSV, which column contains the main, searchable of the row. Integer index or string column name."
|
44
|
-
) do |text_column|
|
45
|
-
options.text_column = text_column
|
46
|
-
end
|
47
|
-
opts.on("--slice-size=SLICE",
|
48
|
-
"Process documents in batches of SLICE. Default is 100. Lower this if you get timeouts. Raise it to go faster."
|
49
|
-
) do |slice_size|
|
50
|
-
options.slice_size = slice_size.to_i
|
51
|
-
end
|
52
|
-
|
53
|
-
opts.on("-o", "--[no-]ocr", "don't attempt to OCR any PDFs, even if they contain no text") do |v|
|
54
|
-
options.ocr = v
|
55
|
-
end
|
56
|
-
|
57
|
-
opts.on( '-?', '--help', 'Display this screen' ) do
|
58
|
-
puts opts
|
59
|
-
exit
|
60
|
-
end
|
9
|
+
require 'optparse'
|
10
|
+
require 'ostruct'
|
11
|
+
options = OpenStruct.new
|
12
|
+
options.ocr = true
|
13
|
+
|
14
|
+
op = OptionParser.new("Usage: stevedore [options] target_(dir_or_csv)") do |opts|
|
15
|
+
opts.on("-hSERVER:PORT", "--host=SERVER:PORT",
|
16
|
+
"The location of the ElasticSearch server") do |host|
|
17
|
+
options.host = host
|
61
18
|
end
|
62
19
|
|
63
|
-
|
20
|
+
opts.on("-iNAME", "--index=NAME",
|
21
|
+
"A name to use for the ES index (defaults to using the directory name)") do |index|
|
22
|
+
options.index = index
|
23
|
+
end
|
64
24
|
|
65
|
-
|
66
|
-
|
67
|
-
|
25
|
+
opts.on("-sPATH", "--s3path=PATH",
|
26
|
+
"The path under your bucket where these files have been uploaded. (defaults to ES index)"
|
27
|
+
) do |s3path|
|
28
|
+
options.s3path = s3path
|
29
|
+
end
|
30
|
+
opts.on("-bPATH", "--s3bucket=PATH",
|
31
|
+
"The s3 bucket where these files have already been be uploaded (or will be later)."
|
32
|
+
) do |s3bucket|
|
33
|
+
options.s3bucket = s3bucket
|
34
|
+
end
|
35
|
+
|
36
|
+
opts.on("--title-column=COLNAME",
|
37
|
+
"If target file is a CSV, which column contains the title of the row. Integer index or string column name."
|
38
|
+
) do |title_column|
|
39
|
+
options.title_column = title_column
|
40
|
+
end
|
41
|
+
opts.on("--text-column=COLNAME",
|
42
|
+
"If target file is a CSV, which column contains the main, searchable of the row. Integer index or string column name."
|
43
|
+
) do |text_column|
|
44
|
+
options.text_column = text_column
|
45
|
+
end
|
46
|
+
opts.on("--slice-size=SLICE",
|
47
|
+
"Process documents in batches of SLICE. Default is 100. Lower this if you get timeouts. Raise it to go faster."
|
48
|
+
) do |slice_size|
|
49
|
+
options.slice_size = slice_size.to_i
|
50
|
+
end
|
51
|
+
|
52
|
+
opts.on("-o", "--[no-]ocr", "don't attempt to OCR any PDFs, even if they contain no text") do |v|
|
53
|
+
options.ocr = v
|
54
|
+
end
|
55
|
+
|
56
|
+
opts.on( '-?', '--help', 'Display this screen' ) do
|
57
|
+
puts opts
|
68
58
|
exit
|
69
59
|
end
|
70
60
|
end
|
71
61
|
|
62
|
+
op.parse!
|
63
|
+
|
64
|
+
# to delete an index: curl -X DELETE localhost:9200/indexname/
|
65
|
+
unless ARGV.length == 1
|
66
|
+
puts op
|
67
|
+
exit
|
68
|
+
end
|
69
|
+
|
72
70
|
# you can provide either a path to files locally or
|
73
71
|
# an S3 endpoint as s3://int-data-dumps/YOURINDEXNAME
|
74
72
|
FOLDER = ARGV.shift
|
@@ -93,28 +91,22 @@ S3_BASEPATH = "https://#{S3_BUCKET}.s3.amazonaws.com/#{S3_PATH}"
|
|
93
91
|
raise ArgumentError, "specify a destination" unless FOLDER
|
94
92
|
raise ArgumentError, "specify the elasticsearch host" unless ES_HOST
|
95
93
|
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
f = Stevedore::ESUploader.new(ES_HOST, ES_INDEX, S3_BUCKET, S3_BASEPATH)
|
102
|
-
f.should_ocr = options.ocr
|
103
|
-
puts "Will not OCR, per --no-ocr option" unless f.should_ocr
|
104
|
-
f.slice_size = options.slice_size if options.slice_size
|
105
|
-
puts "Slice size set to #{f.slice_size}" if options.slice_size
|
106
|
-
|
107
|
-
if FOLDER.match(/\.[ct]sv$/)
|
108
|
-
f.do_csv!(FOLDER, File.join(f.s3_basepath, File.basename(FOLDER)), options.title_column, options.text_column)
|
109
|
-
else
|
110
|
-
f.do!(FOLDER)
|
111
|
-
end
|
112
|
-
puts "Finished uploading documents at #{Time.now}"
|
94
|
+
f = Stevedore::ESUploader.new(ES_HOST, ES_INDEX, S3_BUCKET, S3_BASEPATH)
|
95
|
+
f.should_ocr = options.ocr
|
96
|
+
puts "Will not OCR, per --no-ocr option" unless f.should_ocr
|
97
|
+
f.slice_size = options.slice_size if options.slice_size
|
98
|
+
puts "Slice size set to #{f.slice_size}" if options.slice_size
|
113
99
|
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
100
|
+
if FOLDER.match(/\.[ct]sv$/)
|
101
|
+
f.do_csv!(FOLDER, File.join(f.s3_basepath, File.basename(FOLDER)), options.title_column, options.text_column)
|
102
|
+
else
|
103
|
+
f.do!(FOLDER)
|
104
|
+
end
|
105
|
+
puts "Finished uploading documents at #{Time.now}"
|
106
|
+
|
107
|
+
puts "Created Stevedore for #{ES_INDEX}; go check out https://stevedore.newsdev.net/search/#{ES_INDEX} or http://stevedore.adm.prd.newsdev.nytimes.com/search/#{ES_INDEX}"
|
108
|
+
if f.errors.size > 0
|
109
|
+
STDERR.puts "#{f.errors.size} failed documents:"
|
110
|
+
STDERR.puts f.errors.inspect
|
111
|
+
puts "Uploading successful, but with #{f.errors.size} errors."
|
120
112
|
end
|
data/lib/stevedore-uploader.rb
CHANGED