stevedore-uploader 1.0.3-java → 1.0.4-java
- checksums.yaml +4 -4
- data/README.md +20 -5
- data/bin/upload_to_elasticsearch.rb +10 -3
- data/lib/split_archive.rb +29 -6
- data/lib/stevedore-uploader.rb +58 -39
- metadata +4 -4
checksums.yaml
CHANGED
````diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8caed7d7d3043a2377282f4da9038aebcc091214
+  data.tar.gz: 16fac6b1d157a15a552270ea1f61666e2add3f53
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 954380cf579eb786d91cee303bb820a1bc19123a183d60a0f649c1835ca90fc8a564a0d2c89a833a14710e43fff0fce8b0ff4b377ee2f398c4cbd52ac90ad851
+  data.tar.gz: b85c53a5642cb4c2c8c6906179737470fb2d25ba9dc089dd3b23fc768618120a930e1851c862d032171fd4b58b08a8c6d6dfd7727c6ad952fa6ac8d806a25de3
````
data/README.md
CHANGED
````diff
@@ -19,8 +19,23 @@ This project is in JRuby, so we can leverage the transformative enterprise stabi
 2. be sure you're running Java 8. (java 7 is deprecated, c'mon c'mon)
 3. `bundle install`
 
-
-
+Command-Line Options
+--------------------
+````
+Usage: upload_to_elasticsearch [options] target_(dir_or_csv)
+    -h, --host=SERVER:PORT         The location of the ElasticSearch server
+    -i, --index=NAME               A name to use for the ES index (defaults to using the directory name)
+    -s, --s3path=PATH              The path under your bucket where these files have been uploaded. (defaults to ES index)
+    -b, --s3bucket=PATH            The s3 bucket where these files have already been be uploaded (or will be later).
+        --title_column=COLNAME     If target file is a CSV, which column contains the title of the row. Integer index or string column name.
+        --text_column=COLNAME      If target file is a CSV, which column contains the main, searchable of the row. Integer index or string column name.
+    -o, --[no-]ocr                 don't attempt to OCR any PDFs, even if they contain no text
+    -?, --help                     Display this screen
+````
+
+
+Advanced Usage
+--------------
 
 **This is a piece of a larger upload workflow, [described here](https://github.com/newsdev/stevedore/blob/master/README.md). You should read that first, then come back here.**
 
@@ -37,12 +52,12 @@ if host isn't specified, we assume `localhost:9200`.
 
 e.g.
 ```
-bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --host=https://stevedore.
+bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --host=https://stevedore.elasticsearch.yourdomain.net/es/ ~/code/marco-rubios-emails/emls/
 ```
 
-you may also specify an
+you may also specify an s3:// location of documents to parse, instead of a local directory, e.g.
 ```
-bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --host=https://stevedore.
+bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --host=https://stevedore.elasticsearch.yourdomain.net/es/ s3://int-data-dumps/marco-rubio-fire-drill
 ```
 if you choose to process documents from S3, you should upload those documents using your choice of tool -- but `awscli` is a good choice. *Stevedore-Uploader does NOT upload documents to S3 on your behalf.
 
````
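The README's new examples cover directories and S3 paths but not the CSV mode that the `--title_column`/`--text_column` rows describe. A hypothetical invocation (invented CSV path and column names) might look like the following; note that the help text above prints underscored flag names while the option parser in this release (see the next file) registers hyphenated `--title-column`/`--text-column`, so the hyphenated spelling is the safer bet:

```
bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --title-column=title --text-column=body ~/data/documents.csv
```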
|
@@ -34,16 +34,21 @@ if __FILE__ == $0
|
|
34
34
|
options.s3bucket = s3bucket
|
35
35
|
end
|
36
36
|
|
37
|
-
opts.on("--
|
37
|
+
opts.on("--title-column=COLNAME",
|
38
38
|
"If target file is a CSV, which column contains the title of the row. Integer index or string column name."
|
39
39
|
) do |title_column|
|
40
40
|
options.title_column = title_column
|
41
41
|
end
|
42
|
-
opts.on("--
|
42
|
+
opts.on("--text-column=COLNAME",
|
43
43
|
"If target file is a CSV, which column contains the main, searchable of the row. Integer index or string column name."
|
44
44
|
) do |text_column|
|
45
45
|
options.text_column = text_column
|
46
46
|
end
|
47
|
+
opts.on("--slice-size=SLICE",
|
48
|
+
"Process documents in batches of SLICE. Default is 100. Lower this if you get timeouts. Raise it to go faster."
|
49
|
+
) do |slice_size|
|
50
|
+
options.slice_size = slice_size.to_i
|
51
|
+
end
|
47
52
|
|
48
53
|
opts.on("-o", "--[no-]ocr", "don't attempt to OCR any PDFs, even if they contain no text") do |v|
|
49
54
|
options.ocr = v
|
@@ -95,7 +100,9 @@ raise ArgumentError, "specify the elasticsearch host" unless ES_HOST
|
|
95
100
|
if __FILE__ == $0
|
96
101
|
f = Stevedore::ESUploader.new(ES_HOST, ES_INDEX, S3_BUCKET, S3_BASEPATH)
|
97
102
|
f.should_ocr = options.ocr
|
98
|
-
puts "Will not OCR, per --no-ocr option" unless f.should_ocr
|
103
|
+
puts "Will not OCR, per --no-ocr option" unless f.should_ocr
|
104
|
+
f.slice_size = options.slice_size if options.slice_size
|
105
|
+
puts "Slice size set to #{f.slice_size}" if options.slice_size
|
99
106
|
|
100
107
|
if FOLDER.match(/\.[ct]sv$/)
|
101
108
|
f.do_csv!(FOLDER, File.join(f.s3_basepath, File.basename(FOLDER)), options.title_column, options.text_column)
|
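For reference, a hypothetical run using the new `--slice-size` flag added above (index, host, and path borrowed from the README's examples):

```
bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --host=localhost:9200 --slice-size=50 ~/code/marco-rubios-emails/emls/
```

Lowering the slice trades throughput for fewer Elasticsearch timeouts, per the option's own description.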
data/lib/split_archive.rb
CHANGED
````diff
@@ -1,4 +1,4 @@
-# splits zip, mbox
+# splits zip, mbox, pst files into their constituent documents -- messages and attachments
 # and puts them into a tmp folder
 # which is then parsed normally
 
@@ -11,7 +11,7 @@ require 'pst' # for PST files
 # splits PST and Mbox formats
 module Stevedore
   class ArchiveSplitter
-    HANDLED_FORMATS = ["zip", "mbox", "pst"]
+    HANDLED_FORMATS = ["zip", "mbox", "pst", "eml"]
 
     def self.split(archive_filename)
       # if it's a PST use split_pst
@@ -28,6 +28,8 @@ module Stevedore
         self.split_pst(archive_filename)
       elsif extension == "zip"
         self.split_zip(archive_filename)
+      elsif extension == "eml"
+        self.get_attachments_from_eml(archive_filename)
       end
       # should yield a relative filename
       # and a lambda that will write the file contents to the given filename
@@ -36,8 +38,14 @@ module Stevedore
       constituent_files.each_with_index do |basename_contents_lambda, idx|
         basename, contents_lambda = *basename_contents_lambda
         tmp_filename = File.join(tmpdir, File.basename(archive_filename), basename.gsub("/", "") )
-
-
+        FileUtils.mkdir_p(File.dirname(tmp_filename))
+        begin
+          contents_lambda.call(tmp_filename)
+        rescue Errno::ENOENT
+          puts "#{tmp_filename} wasn't extracted from #{archive_filename}"
+          next
+        end
         yielder.yield tmp_filename, File.join(File.basename(archive_filename), basename)
       end
     end
@@ -83,18 +91,33 @@ module Stevedore
             yielder << ["#{idx}.eml", lambda{|fn| open(fn, 'wb'){|fh| fh << mail_str.join("") } }]
             mail = Mail.new mail_str.join("")
             mail.attachments.each do |attachment|
-              yielder << [attachment.filename, lambda{|fn| open(fn, 'wb'){|fh| attachment.
+              yielder << [attachment.filename, lambda{|fn| open(fn, 'wb'){|fh| fh << attachment.body.decoded }}]
             end
           end
         end
       end
     end
 
+    def self.get_attachments_from_eml(email_filename)
+      Enumerator.new do |yielder|
+        yielder << [File.basename(email_filename), lambda{|fn| open(fn, 'wb'){|fh| fh << open(email_filename){|f| f.read } } }]
+        mail = Mail.new open(email_filename){|f| f.read }
+        mail.attachments.each do |attachment|
+          yielder << [attachment.filename, lambda{|fn| open(fn, 'wb'){|fh| fh << attachment.body.decoded }}]
+        end
+      end
+    end
+
+
     def self.split_zip(archive_filename)
       Zip::File.open(archive_filename) do |zip_file|
         Enumerator.new do |yielder|
           zip_file.each do |entry|
-
+            begin
+              yielder << [entry.name, lambda{|fn| entry.extract(fn) }]
+            rescue
+              puts "unable to extract #{entry.name} from #{archive_filename}"
+            end
           end
         end
      end
````
data/lib/stevedore-uploader.rb
CHANGED
````diff
@@ -1,8 +1,6 @@
-Dir["#{File.expand_path(File.dirname(__FILE__))}/../lib/*.rb"].each {|f| require f}
-Dir["#{File.expand_path(File.dirname(__FILE__))}/../lib/parsers/*.rb"].each {|f| require f}
 
 require 'rika'
-
+require 'jruby-openssl'
 require 'net/https'
 require 'elasticsearch'
 require 'elasticsearch/transport/transport/http/manticore'
@@ -12,8 +10,9 @@ require 'manticore'
 require 'fileutils'
 require 'csv'
 
-
 require 'aws-sdk'
+Dir["#{File.expand_path(File.dirname(__FILE__))}/../lib/*.rb"].each {|f| require f}
+Dir["#{File.expand_path(File.dirname(__FILE__))}/../lib/parsers/*.rb"].each {|f| require f}
 
 
 module Stevedore
@@ -33,15 +32,16 @@ module Stevedore
        },
      )
      @es_index = es_index
-     @s3_bucket = s3_bucket
-     @s3_basepath = "https://#{s3_bucket}.s3.amazonaws.com/#{s3_path || es_index}"
+     @s3_bucket = s3_bucket
+     @s3_basepath = "https://#{s3_bucket}.s3.amazonaws.com/#{s3_path || es_index}/"
+     @use_s3 = !s3_bucket.nil?
 
      @slice_size = 100
 
      @should_ocr = false
 
      self.create_index!
-     self.add_mapping(:doc, MAPPING)
+     self.add_mapping(:doc, Stevedore.const_defined?("MAPPING") ? MAPPING : DEFAULT_MAPPING)
    end
 
    def create_index!
@@ -92,25 +92,30 @@ module Stevedore
      }) # was "rescue nil" but that obscured meaningful errors
    end
 
-    def bulk_upload_to_es!(data, type
-      return nil if data.empty?
-
-      resp = @client.
-
-
-
-
-
-
+    def bulk_upload_to_es!(data, type=:doc)
+      return nil if data.compact.empty?
+      if data.size == 1
+        resp = @client.index index: @es_index, type: type, id: data.first["_id"], body: data.first
+      else
+        begin
+          resp = @client.bulk body: data.map{|datum| {index: {_index: @es_index, _type: type || 'doc', data: datum }} }
+          puts resp if resp[:errors]
+        rescue JSON::GeneratorError, Elasticsearch::Transport::Transport::Errors::InternalServerError
+          data.each do |datum|
+            begin
+              @client.bulk body: [datum].map{|datum| {index: {_index: @es_index, _type: type || 'doc', data: datum }} } unless datum.nil?
+            rescue JSON::GeneratorError, Elasticsearch::Transport::Transport::Errors::InternalServerError
+              next
+            end
          end
+          resp = nil
        end
        resp = nil
      end
      resp
    end
 
-    def process_document(filename,
+    def process_document(filename, download_url)
      begin
        puts "begin to process #{filename}"
        # puts "size: #{File.size(filename)}"
````
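The rewritten `bulk_upload_to_es!` above tries one bulk request per slice and, only when serialization or the server chokes, falls back to indexing the slice one document at a time, skipping the stragglers. Schematically (a sketch, not the gem's exact code; `client` is any `Elasticsearch::Client`, and `resilient_bulk` is an invented name):

```ruby
require 'json'
require 'elasticsearch' # provides Elasticsearch::Transport::Transport::Errors

# Try the whole slice in one bulk call; on JSON::GeneratorError or a 500,
# retry per-document so one bad record can't sink the other 99.
def resilient_bulk(client, index, docs)
  client.bulk body: docs.map { |d| { index: { _index: index, _type: 'doc', data: d } } }
rescue JSON::GeneratorError, Elasticsearch::Transport::Transport::Errors::InternalServerError
  docs.each do |d|
    begin
      client.bulk body: [{ index: { _index: index, _type: 'doc', data: d } }]
    rescue JSON::GeneratorError, Elasticsearch::Transport::Transport::Errors::InternalServerError
      next # give up on just this document
    end
  end
  nil # mirror the gem's behavior: no usable response after a fallback
end
```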
````diff
@@ -121,7 +126,7 @@ module Stevedore
        metadata = "couldn't be parsed"
      end
      puts "parsed: #{content.size}"
-      if content.size >
+      if content.size > 3 * (10 ** 7)
        @errors << filename
        puts "skipping #{filename} for being too big"
        return nil
@@ -133,9 +138,9 @@ module Stevedore
      # document types on its own
      ret = case # .eml # .msg
      when metadata["Content-Type"] == "message/rfc822" || metadata["Content-Type"] == "application/vnd.ms-outlook"
-        ::Stevedore::StevedoreEmail.new_from_tika(content, metadata,
+        ::Stevedore::StevedoreEmail.new_from_tika(content, metadata, download_url, filename).to_hash
      when metadata["Content-Type"] && ["application/html", "application/xhtml+xml"].include?(metadata["Content-Type"].split(";").first)
-        ::Stevedore::StevedoreHTML.new_from_tika(content, metadata,
+        ::Stevedore::StevedoreHTML.new_from_tika(content, metadata, download_url, filename).to_hash
      when @should_ocr && metadata["Content-Type"] == "application/pdf" && (content.match(/\A\s*\z/) || content.size < 50 * metadata["xmpTPg:NPages"].to_i )
        # this is a scanned PDF.
        puts "scanned PDF #{File.basename(filename)} detected; OCRing"
@@ -146,7 +151,7 @@ module Stevedore
          File.delete(png)
          # no need to use a system call when we could use the stdlib!
          # system("rm", "-f", png) rescue nil
-          File.delete("#{png}.txt")
+          File.delete("#{png}.txt") rescue nil
        end.join("\n\n")
        # e.g. Analysis-Corporation-2.png.pdf or Torture.pdf
        files = Dir["#{pdf_basename}.png.pdf"] + (Dir["#{pdf_basename}-*.png.pdf"].sort_by{|pdf| Regexp.new("#{pdf_basename}-([0-9]+).png.pdf").match(pdf)[1].to_i })
@@ -154,9 +159,9 @@ module Stevedore
        system('pdftk', *files, "cat", "output", "#{pdf_basename}.ocr.pdf")
        content, _ = Rika.parse_content_and_metadata("#{pdf_basename}.ocr.pdf")
        puts "OCRed content (#{File.basename(filename)}) length: #{content.length}"
-        ::Stevedore::StevedoreBlob.new_from_tika(content, metadata,
+        ::Stevedore::StevedoreBlob.new_from_tika(content, metadata, download_url, filename).to_hash
      else
-        ::Stevedore::StevedoreBlob.new_from_tika(content, metadata,
+        ::Stevedore::StevedoreBlob.new_from_tika(content, metadata, download_url, filename).to_hash
      end
      [ret, content, metadata]
    rescue StandardError, java.lang.NoClassDefFoundError, org.apache.tika.exception.TikaException => e
@@ -169,7 +174,7 @@ module Stevedore
      end
    end
 
-    def do_csv!(file, download_url, title_column=0, text_column=nil)
+    def do_csv!(file, download_url, title_column=0, text_column=nil, type=nil)
      docs_so_far = 0
      CSV.open(file, headers: (!title_column.is_a? Fixnum ) ).each_slice(@slice_size).each_with_index do |slice, slice_index|
        slice_of_rows = slice.map.each_with_index do |row, i|
@@ -185,7 +190,7 @@ module Stevedore
          doc
        end
        begin
-          resp = bulk_upload_to_es!(slice_of_rows.compact)
+          resp = bulk_upload_to_es!(slice_of_rows.compact.reject(&:empty?), type)
          docs_so_far += @slice_size
        rescue Manticore::Timeout, Manticore::SocketException
          STDERR.puts("retrying at #{Time.now}")
@@ -200,7 +205,6 @@ module Stevedore
      output_stream.puts "Processing documents from #{target_path}"
 
      docs_so_far = 0
-      # use_s3 = false # option to set this (an option to set document URLs to be relative to the search engine root) is TK
      @s3_bucket = target_path.gsub(/s3:\/\//i, '').split("/", 2).first if @s3_bucket.nil? && target_path.downcase.include?('s3://')
 
      if target_path.downcase.include?("s3://")
@@ -237,6 +241,7 @@ module Stevedore
          # PDFs can (theoretically) contain documents as "attachments" -- those aren't handled here either.x
          if ArchiveSplitter::HANDLED_FORMATS.include?(tmp_filename.split(".")[-1])
            ArchiveSplitter.split(tmp_filename).map do |constituent_file, constituent_basename|
+              doc = {} if doc.nil?
              doc, content, metadata = process_document(constituent_file, download_filename)
              doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive, we need to come up with a custom sha1)
              yield doc, obj.key, content, metadata if block_given?
@@ -250,31 +255,38 @@ module Stevedore
            [doc]
          end
        end
+        retry_count = 0
        begin
-          resp = bulk_upload_to_es!(slice_of_objs.compact.flatten(1)) # flatten, in case there's an archive
+          resp = bulk_upload_to_es!(slice_of_objs.compact.flatten(1).reject(&:empty?)) # flatten, in case there's an archive
          puts resp.inspect if resp && resp["errors"]
        rescue Manticore::Timeout, Manticore::SocketException
          output_stream.puts("retrying at #{Time.now}")
-
+          if retry_count < 10
+            retry_count += 1
+            retry
+          else
+            @errors << filename
+          end
        end
        output_stream.puts "uploaded #{slice_of_objs.size} files to #{@es_index}; #{docs_so_far} uploaded so far"
        output_stream.puts "Errors in bulk upload: #{resp.inspect}" if resp && resp["errors"]
      end
    end
    else
-      list_of_files = File.file?(target_path) ? [target_path] : Dir[
+      list_of_files = File.file?(target_path) ? [target_path] : Dir[target_path.include?('*') ? target_path : File.join(target_path, '**/*')]
      list_of_files.each_slice(@slice_size) do |slice_of_files|
        output_stream.puts "starting a set of #{@slice_size}"
        docs_so_far += slice_of_files.size
 
        slice_of_files.map! do |filename|
          next unless File.file?(filename)
-          filename_basepath = filename.gsub(target_path, '')
-
-
-
-
-
+          filename_basepath = filename.gsub(target_path.split("*").first, '')
+
+          if @use_s3 # turning this on TK
+            download_filename = @s3_basepath + ((filename_basepath[0] == '/' || @s3_basepath[-1] == '/') ? '' : '/') + filename_basepath
+          else
+            download_filename = "/files/#{@es_index}/#{filename_basepath}"
+          end
 
          # is this file an archive that contains a bunch of documents we should index separately?
          # obviously, there is not a strict definition here.
@@ -285,6 +297,7 @@ module Stevedore
          ArchiveSplitter.split(filename).map do |constituent_file, constituent_basename|
            doc, content, metadata = process_document(constituent_file, download_filename)
            doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive, we need to come up with a custom sha1)
+            doc["id"] = doc["sha1"]
            yield doc, filename, content, metadata if block_given?
            # FileUtils.rm(constituent_file) rescue Errno::ENOENT # try to delete, but no biggie if it doesn't work for some weird reason.
            doc
@@ -295,6 +308,7 @@ module Stevedore
            [doc]
          end
        end
+        retry_count = 0
        begin
          resp = bulk_upload_to_es!(slice_of_files.compact.flatten(1)) # flatten, in case there's an archive
          puts resp.inspect if resp && resp["errors"]
@@ -303,7 +317,12 @@ module Stevedore
          output_stream.puts "Upload error: #{e} #{e.message}."
          output_stream.puts e.backtrace.join("\n") + "\n\n\n"
          output_stream.puts("retrying at #{Time.now}")
-
+          if retry_count < 10
+            retry_count += 1
+            retry
+          else
+            @errors << filename
+          end
        end
        output_stream.puts "uploaded #{slice_of_files.size} files to #{@es_index}; #{docs_so_far} uploaded so far"
        output_stream.puts "Errors in bulk upload: #{resp.inspect}" if resp && resp["errors"]
@@ -311,7 +330,7 @@ module Stevedore
      end
    end
  end
-  MAPPING = {
+  DEFAULT_MAPPING = {
    sha1: {type: :string, index: :not_analyzed},
    title: { type: :string, analyzer: :keyword },
    source_url: {type: :string, index: :not_analyzed},
````
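Both upload loops now share the `retry_count`/`retry` idiom added above: up to ten retries on Manticore timeouts, after which the slice is recorded in `@errors` and the loop moves on. In isolation, with a simulated flaky request standing in for `bulk_upload_to_es!`, the control flow looks like this sketch:

```ruby
require 'manticore' # the JRuby HTTP client whose errors the gem rescues

errors = []
attempts = 0
retry_count = 0
begin
  attempts += 1
  raise Manticore::Timeout if attempts < 3 # simulate two flaky requests
  puts "uploaded on attempt #{attempts}"
rescue Manticore::Timeout, Manticore::SocketException
  if retry_count < 10
    retry_count += 1
    retry # re-runs the begin body; retry_count survives because it lives outside it
  else
    errors << "this-slice" # after ten failures, log it and move on
  end
end
```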
metadata
CHANGED
````diff
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: stevedore-uploader
 version: !ruby/object:Gem::Version
-  version: 1.0.3
+  version: 1.0.4
 platform: java
 authors:
 - Jeremy B. Merrill
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-
+date: 2016-10-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -43,7 +43,7 @@ dependencies:
   requirements:
   - - "~>"
     - !ruby/object:Gem::Version
-      version:
+      version: 0.9.17
   name: jruby-openssl
   prerelease: false
   type: :runtime
@@ -51,7 +51,7 @@ dependencies:
   requirements:
   - - "~>"
     - !ruby/object:Gem::Version
-      version:
+      version: 0.9.17
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
   requirements:
````