stevedore-uploader 1.0.3-java → 1.0.4-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +20 -5
- data/bin/upload_to_elasticsearch.rb +10 -3
- data/lib/split_archive.rb +29 -6
- data/lib/stevedore-uploader.rb +58 -39
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8caed7d7d3043a2377282f4da9038aebcc091214
+  data.tar.gz: 16fac6b1d157a15a552270ea1f61666e2add3f53
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 954380cf579eb786d91cee303bb820a1bc19123a183d60a0f649c1835ca90fc8a564a0d2c89a833a14710e43fff0fce8b0ff4b377ee2f398c4cbd52ac90ad851
+  data.tar.gz: b85c53a5642cb4c2c8c6906179737470fb2d25ba9dc089dd3b23fc768618120a930e1851c862d032171fd4b58b08a8c6d6dfd7727c6ad952fa6ac8d806a25de3

data/README.md
CHANGED
@@ -19,8 +19,23 @@ This project is in JRuby, so we can leverage the transformative enterprise stabi
 2. be sure you're running Java 8. (java 7 is deprecated, c'mon c'mon)
 3. `bundle install`
 
-
-
+Command-Line Options
+--------------------
+````
+Usage: upload_to_elasticsearch [options] target_(dir_or_csv)
+    -h, --host=SERVER:PORT        The location of the ElasticSearch server
+    -i, --index=NAME              A name to use for the ES index (defaults to using the directory name)
+    -s, --s3path=PATH             The path under your bucket where these files have been uploaded. (defaults to ES index)
+    -b, --s3bucket=PATH           The s3 bucket where these files have already been be uploaded (or will be later).
+        --title_column=COLNAME    If target file is a CSV, which column contains the title of the row. Integer index or string column name.
+        --text_column=COLNAME     If target file is a CSV, which column contains the main, searchable of the row. Integer index or string column name.
+    -o, --[no-]ocr                don't attempt to OCR any PDFs, even if they contain no text
+    -?, --help                    Display this screen
+````
+
+
+Advanced Usage
+--------------
 
 **This is a piece of a larger upload workflow, [described here](https://github.com/newsdev/stevedore/blob/master/README.md). You should read that first, then come back here.**
 
@@ -37,12 +52,12 @@ if host isn't specified, we assume `localhost:9200`.
 
 e.g.
 ```
-bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --host=https://stevedore.
+bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --host=https://stevedore.elasticsearch.yourdomain.net/es/ ~/code/marco-rubios-emails/emls/
 ```
 
-you may also specify an
+you may also specify an s3:// location of documents to parse, instead of a local directory, e.g.
 ```
-bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --host=https://stevedore.
+bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --host=https://stevedore.elasticsearch.yourdomain.net/es/ s3://int-data-dumps/marco-rubio-fire-drill
 ```
 if you choose to process documents from S3, you should upload those documents using your choice of tool -- but `awscli` is a good choice. *Stevedore-Uploader does NOT upload documents to S3 on your behalf.

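The new README section documents the CSV flags but stops short of showing them in action. A hypothetical invocation in the same style as the examples above (the index name, column names, and file path are placeholders; `--host` falls back to `localhost:9200` when omitted):

```
bundle exec ruby bin/upload_to_elasticsearch.rb --index=csvtest --title_column=subject --text_column=body ~/data/messages.csv
```
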
data/bin/upload_to_elasticsearch.rb
CHANGED
@@ -34,16 +34,21 @@ if __FILE__ == $0
     options.s3bucket = s3bucket
   end
 
-  opts.on("--
+  opts.on("--title-column=COLNAME",
     "If target file is a CSV, which column contains the title of the row. Integer index or string column name."
   ) do |title_column|
     options.title_column = title_column
   end
-  opts.on("--
+  opts.on("--text-column=COLNAME",
     "If target file is a CSV, which column contains the main, searchable of the row. Integer index or string column name."
   ) do |text_column|
     options.text_column = text_column
   end
+  opts.on("--slice-size=SLICE",
+    "Process documents in batches of SLICE. Default is 100. Lower this if you get timeouts. Raise it to go faster."
+  ) do |slice_size|
+    options.slice_size = slice_size.to_i
+  end
 
   opts.on("-o", "--[no-]ocr", "don't attempt to OCR any PDFs, even if they contain no text") do |v|
     options.ocr = v
@@ -95,7 +100,9 @@ raise ArgumentError, "specify the elasticsearch host" unless ES_HOST
 if __FILE__ == $0
   f = Stevedore::ESUploader.new(ES_HOST, ES_INDEX, S3_BUCKET, S3_BASEPATH)
   f.should_ocr = options.ocr
-  puts "Will not OCR, per --no-ocr option" unless f.should_ocr
+  puts "Will not OCR, per --no-ocr option" unless f.should_ocr
+  f.slice_size = options.slice_size if options.slice_size
+  puts "Slice size set to #{f.slice_size}" if options.slice_size
 
   if FOLDER.match(/\.[ct]sv$/)
     f.do_csv!(FOLDER, File.join(f.s3_basepath, File.basename(FOLDER)), options.title_column, options.text_column)

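The new `--slice-size` flag feeds the `slice_size` accessor on `ESUploader`, lowering or raising the upload batch size from its default of 100. A hypothetical invocation in the style of the README examples, shrinking batches to ride out timeouts (index name and path are placeholders):

```
bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --slice-size=50 ~/code/marco-rubios-emails/emls/
```
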
data/lib/split_archive.rb
CHANGED
@@ -1,4 +1,4 @@
-# splits zip, mbox
+# splits zip, mbox, pst files into their constituent documents -- messages and attachments
 # and puts them into a tmp folder
 # which is then parsed normally
 
@@ -11,7 +11,7 @@ require 'pst' # for PST files
 # splits PST and Mbox formats
 module Stevedore
   class ArchiveSplitter
-    HANDLED_FORMATS = ["zip", "mbox", "pst"]
+    HANDLED_FORMATS = ["zip", "mbox", "pst", "eml"]
 
     def self.split(archive_filename)
       # if it's a PST use split_pst
@@ -28,6 +28,8 @@ module Stevedore
           self.split_pst(archive_filename)
         elsif extension == "zip"
           self.split_zip(archive_filename)
+        elsif extension == "eml"
+          self.get_attachments_from_eml(archive_filename)
         end
       # should yield a relative filename
       # and a lambda that will write the file contents to the given filename
@@ -36,8 +38,14 @@ module Stevedore
       constituent_files.each_with_index do |basename_contents_lambda, idx|
         basename, contents_lambda = *basename_contents_lambda
         tmp_filename = File.join(tmpdir, File.basename(archive_filename), basename.gsub("/", "") )
-
-
+        FileUtils.mkdir_p(File.dirname(tmp_filename))
+        begin
+          contents_lambda.call(tmp_filename)
+        rescue Errno::ENOENT
+          puts "#{tmp_filename} wasn't extracted from #{archive_filename}"
+          next
+        end
+        yielder.yield tmp_filename, File.join(File.basename(archive_filename), basename)
       end
     end
   end
@@ -83,18 +91,33 @@ module Stevedore
           yielder << ["#{idx}.eml", lambda{|fn| open(fn, 'wb'){|fh| fh << mail_str.join("") } }]
           mail = Mail.new mail_str.join("")
           mail.attachments.each do |attachment|
-            yielder << [attachment.filename, lambda{|fn| open(fn, 'wb'){|fh| attachment.
+            yielder << [attachment.filename, lambda{|fn| open(fn, 'wb'){|fh| fh << attachment.body.decoded }}]
           end
         end
       end
     end
   end
 
+    def self.get_attachments_from_eml(email_filename)
+      Enumerator.new do |yielder|
+        yielder << [File.basename(email_filename), lambda{|fn| open(fn, 'wb'){|fh| fh << open(email_filename){|f| f.read } } }]
+        mail = Mail.new open(email_filename){|f| f.read }
+        mail.attachments.each do |attachment|
+          yielder << [attachment.filename, lambda{|fn| open(fn, 'wb'){|fh| fh << attachment.body.decoded }}]
+        end
+      end
+    end
+
+
     def self.split_zip(archive_filename)
       Zip::File.open(archive_filename) do |zip_file|
         Enumerator.new do |yielder|
           zip_file.each do |entry|
-
+            begin
+              yielder << [entry.name, lambda{|fn| entry.extract(fn) }]
+            rescue
+              puts "unable to extract #{entry.name} from #{archive_filename}"
+            end
           end
         end
       end

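With `eml` added to `HANDLED_FORMATS`, `ArchiveSplitter.split` now breaks a single `.eml` file into the message itself plus one entry per attachment, just as it does for zip, mbox, and pst archives. A minimal sketch of calling it directly, assuming a local `message.eml` exists (the filename is a placeholder):

```ruby
require 'stevedore-uploader' # pulls in lib/split_archive.rb via the Dir[...] requires

# split returns an enumerable of [tmp_filename, relative_name] pairs:
# the extracted message, then each attachment written to a temp directory.
Stevedore::ArchiveSplitter.split("message.eml").each do |tmp_filename, relative_name|
  puts "#{relative_name} extracted to #{tmp_filename}"
end
```
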
data/lib/stevedore-uploader.rb
CHANGED
@@ -1,8 +1,6 @@
-Dir["#{File.expand_path(File.dirname(__FILE__))}/../lib/*.rb"].each {|f| require f}
-Dir["#{File.expand_path(File.dirname(__FILE__))}/../lib/parsers/*.rb"].each {|f| require f}
 
 require 'rika'
-
+require 'jruby-openssl'
 require 'net/https'
 require 'elasticsearch'
 require 'elasticsearch/transport/transport/http/manticore'
@@ -12,8 +10,9 @@ require 'manticore'
 require 'fileutils'
 require 'csv'
 
-
 require 'aws-sdk'
+Dir["#{File.expand_path(File.dirname(__FILE__))}/../lib/*.rb"].each {|f| require f}
+Dir["#{File.expand_path(File.dirname(__FILE__))}/../lib/parsers/*.rb"].each {|f| require f}
 
 
 module Stevedore
@@ -33,15 +32,16 @@ module Stevedore
       },
     )
     @es_index = es_index
-    @s3_bucket = s3_bucket
-    @s3_basepath = "https://#{s3_bucket}.s3.amazonaws.com/#{s3_path || es_index}"
+    @s3_bucket = s3_bucket
+    @s3_basepath = "https://#{s3_bucket}.s3.amazonaws.com/#{s3_path || es_index}/"
+    @use_s3 = !s3_bucket.nil?
 
     @slice_size = 100
 
     @should_ocr = false
 
     self.create_index!
-    self.add_mapping(:doc, MAPPING)
+    self.add_mapping(:doc, Stevedore.const_defined?("MAPPING") ? MAPPING : DEFAULT_MAPPING)
   end
 
   def create_index!
@@ -92,25 +92,30 @@ module Stevedore
     }) # was "rescue nil" but that obscured meaningful errors
   end
 
-  def bulk_upload_to_es!(data, type
-    return nil if data.empty?
-
-    resp = @client.
-
-
-
-
-
-
-
+  def bulk_upload_to_es!(data, type=:doc)
+    return nil if data.compact.empty?
+    if data.size == 1
+      resp = @client.index index: @es_index, type: type, id: data.first["_id"], body: data.first
+    else
+      begin
+        resp = @client.bulk body: data.map{|datum| {index: {_index: @es_index, _type: type || 'doc', data: datum }} }
+        puts resp if resp[:errors]
+      rescue JSON::GeneratorError, Elasticsearch::Transport::Transport::Errors::InternalServerError
+        data.each do |datum|
+          begin
+            @client.bulk body: [datum].map{|datum| {index: {_index: @es_index, _type: type || 'doc', data: datum }} } unless datum.nil?
+          rescue JSON::GeneratorError, Elasticsearch::Transport::Transport::Errors::InternalServerError
+            next
+          end
        end
+        resp = nil
      end
      resp = nil
    end
    resp
  end
 
-  def process_document(filename,
+  def process_document(filename, download_url)
    begin
      puts "begin to process #{filename}"
      # puts "size: #{File.size(filename)}"
@@ -121,7 +126,7 @@ module Stevedore
        metadata = "couldn't be parsed"
      end
      puts "parsed: #{content.size}"
-      if content.size >
+      if content.size > 3 * (10 ** 7)
        @errors << filename
        puts "skipping #{filename} for being too big"
        return nil
@@ -133,9 +138,9 @@ module Stevedore
      # document types on its own
      ret = case # .eml # .msg
      when metadata["Content-Type"] == "message/rfc822" || metadata["Content-Type"] == "application/vnd.ms-outlook"
-        ::Stevedore::StevedoreEmail.new_from_tika(content, metadata,
+        ::Stevedore::StevedoreEmail.new_from_tika(content, metadata, download_url, filename).to_hash
      when metadata["Content-Type"] && ["application/html", "application/xhtml+xml"].include?(metadata["Content-Type"].split(";").first)
-        ::Stevedore::StevedoreHTML.new_from_tika(content, metadata,
+        ::Stevedore::StevedoreHTML.new_from_tika(content, metadata, download_url, filename).to_hash
      when @should_ocr && metadata["Content-Type"] == "application/pdf" && (content.match(/\A\s*\z/) || content.size < 50 * metadata["xmpTPg:NPages"].to_i )
        # this is a scanned PDF.
        puts "scanned PDF #{File.basename(filename)} detected; OCRing"
@@ -146,7 +151,7 @@ module Stevedore
          File.delete(png)
          # no need to use a system call when we could use the stdlib!
          # system("rm", "-f", png) rescue nil
-          File.delete("#{png}.txt")
+          File.delete("#{png}.txt") rescue nil
        end.join("\n\n")
        # e.g. Analysis-Corporation-2.png.pdf or Torture.pdf
        files = Dir["#{pdf_basename}.png.pdf"] + (Dir["#{pdf_basename}-*.png.pdf"].sort_by{|pdf| Regexp.new("#{pdf_basename}-([0-9]+).png.pdf").match(pdf)[1].to_i })
@@ -154,9 +159,9 @@ module Stevedore
        system('pdftk', *files, "cat", "output", "#{pdf_basename}.ocr.pdf")
        content, _ = Rika.parse_content_and_metadata("#{pdf_basename}.ocr.pdf")
        puts "OCRed content (#{File.basename(filename)}) length: #{content.length}"
-        ::Stevedore::StevedoreBlob.new_from_tika(content, metadata,
+        ::Stevedore::StevedoreBlob.new_from_tika(content, metadata, download_url, filename).to_hash
      else
-        ::Stevedore::StevedoreBlob.new_from_tika(content, metadata,
+        ::Stevedore::StevedoreBlob.new_from_tika(content, metadata, download_url, filename).to_hash
      end
      [ret, content, metadata]
    rescue StandardError, java.lang.NoClassDefFoundError, org.apache.tika.exception.TikaException => e
@@ -169,7 +174,7 @@ module Stevedore
    end
  end
 
-  def do_csv!(file, download_url, title_column=0, text_column=nil)
+  def do_csv!(file, download_url, title_column=0, text_column=nil, type=nil)
    docs_so_far = 0
    CSV.open(file, headers: (!title_column.is_a? Fixnum ) ).each_slice(@slice_size).each_with_index do |slice, slice_index|
      slice_of_rows = slice.map.each_with_index do |row, i|
@@ -185,7 +190,7 @@ module Stevedore
        doc
      end
      begin
-        resp = bulk_upload_to_es!(slice_of_rows.compact)
+        resp = bulk_upload_to_es!(slice_of_rows.compact.reject(&:empty?), type)
        docs_so_far += @slice_size
      rescue Manticore::Timeout, Manticore::SocketException
        STDERR.puts("retrying at #{Time.now}")
@@ -200,7 +205,6 @@ module Stevedore
      output_stream.puts "Processing documents from #{target_path}"
 
      docs_so_far = 0
-      # use_s3 = false # option to set this (an option to set document URLs to be relative to the search engine root) is TK
      @s3_bucket = target_path.gsub(/s3:\/\//i, '').split("/", 2).first if @s3_bucket.nil? && target_path.downcase.include?('s3://')
 
      if target_path.downcase.include?("s3://")
@@ -237,6 +241,7 @@ module Stevedore
            # PDFs can (theoretically) contain documents as "attachments" -- those aren't handled here either.x
            if ArchiveSplitter::HANDLED_FORMATS.include?(tmp_filename.split(".")[-1])
              ArchiveSplitter.split(tmp_filename).map do |constituent_file, constituent_basename|
+                doc = {} if doc.nil?
                doc, content, metadata = process_document(constituent_file, download_filename)
                doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive, we need to come up with a custom sha1)
                yield doc, obj.key, content, metadata if block_given?
@@ -250,31 +255,38 @@ module Stevedore
              [doc]
            end
          end
+          retry_count = 0
          begin
-            resp = bulk_upload_to_es!(slice_of_objs.compact.flatten(1)) # flatten, in case there's an archive
+            resp = bulk_upload_to_es!(slice_of_objs.compact.flatten(1).reject(&:empty?)) # flatten, in case there's an archive
            puts resp.inspect if resp && resp["errors"]
          rescue Manticore::Timeout, Manticore::SocketException
            output_stream.puts("retrying at #{Time.now}")
-
+            if retry_count < 10
+              retry_count += 1
+              retry
+            else
+              @errors << filename
+            end
          end
          output_stream.puts "uploaded #{slice_of_objs.size} files to #{@es_index}; #{docs_so_far} uploaded so far"
          output_stream.puts "Errors in bulk upload: #{resp.inspect}" if resp && resp["errors"]
        end
      end
    else
-      list_of_files = File.file?(target_path) ? [target_path] : Dir[
+      list_of_files = File.file?(target_path) ? [target_path] : Dir[target_path.include?('*') ? target_path : File.join(target_path, '**/*')]
      list_of_files.each_slice(@slice_size) do |slice_of_files|
        output_stream.puts "starting a set of #{@slice_size}"
        docs_so_far += slice_of_files.size
 
        slice_of_files.map! do |filename|
          next unless File.file?(filename)
-          filename_basepath = filename.gsub(target_path, '')
-
-
-
-
-
+          filename_basepath = filename.gsub(target_path.split("*").first, '')
+
+          if @use_s3 # turning this on TK
+            download_filename = @s3_basepath + ((filename_basepath[0] == '/' || @s3_basepath[-1] == '/') ? '' : '/') + filename_basepath
+          else
+            download_filename = "/files/#{@es_index}/#{filename_basepath}"
+          end
 
          # is this file an archive that contains a bunch of documents we should index separately?
          # obviously, there is not a strict definition here.
@@ -285,6 +297,7 @@ module Stevedore
          ArchiveSplitter.split(filename).map do |constituent_file, constituent_basename|
            doc, content, metadata = process_document(constituent_file, download_filename)
            doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive, we need to come up with a custom sha1)
+            doc["id"] = doc["sha1"]
            yield doc, filename, content, metadata if block_given?
            # FileUtils.rm(constituent_file) rescue Errno::ENOENT # try to delete, but no biggie if it doesn't work for some weird reason.
            doc
@@ -295,6 +308,7 @@ module Stevedore
            [doc]
          end
        end
+        retry_count = 0
        begin
          resp = bulk_upload_to_es!(slice_of_files.compact.flatten(1)) # flatten, in case there's an archive
          puts resp.inspect if resp && resp["errors"]
@@ -303,7 +317,12 @@ module Stevedore
          output_stream.puts "Upload error: #{e} #{e.message}."
          output_stream.puts e.backtrace.join("\n") + "\n\n\n"
          output_stream.puts("retrying at #{Time.now}")
-
+          if retry_count < 10
+            retry_count += 1
+            retry
+          else
+            @errors << filename
+          end
        end
        output_stream.puts "uploaded #{slice_of_files.size} files to #{@es_index}; #{docs_so_far} uploaded so far"
        output_stream.puts "Errors in bulk upload: #{resp.inspect}" if resp && resp["errors"]
@@ -311,7 +330,7 @@ module Stevedore
      end
    end
  end
-
+  DEFAULT_MAPPING = {
    sha1: {type: :string, index: :not_analyzed},
    title: { type: :string, analyzer: :keyword },
    source_url: {type: :string, index: :not_analyzed},

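Taken together, the new accessors and URL logic can be driven without the bin script. A minimal sketch mirroring what `bin/upload_to_elasticsearch.rb` does after option parsing (the host, index, and CSV path are hypothetical; passing `nil` for the bucket leaves `@use_s3` off, so download URLs take the `/files/<index>/...` form):

```ruby
require 'stevedore-uploader'

# host, index, s3_bucket, s3_path -- same argument order as the bin script uses
uploader = Stevedore::ESUploader.new("localhost:9200", "jrubytest", nil, nil)
uploader.should_ocr = false # same effect as --no-ocr
uploader.slice_size = 50    # the accessor behind the new --slice-size flag

# For a CSV, title_column 0 names each row and text_column 1 is the searchable body.
uploader.do_csv!("messages.csv", "/files/jrubytest/messages.csv", 0, 1)
```
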
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: stevedore-uploader
 version: !ruby/object:Gem::Version
-  version: 1.0.3
+  version: 1.0.4
 platform: java
 authors:
 - Jeremy B. Merrill
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-
+date: 2016-10-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -43,7 +43,7 @@ dependencies:
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version:
+        version: 0.9.17
   name: jruby-openssl
   prerelease: false
   type: :runtime
@@ -51,7 +51,7 @@ dependencies:
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version:
+        version: 0.9.17
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
     requirements:
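To pull in the updated release, a Gemfile entry along these lines should work (a sketch: `platform: :jruby` reflects the gem's `platform: java`, and the pinned jruby-openssl `~> 0.9.17` arrives automatically as a runtime dependency):

```ruby
# Gemfile -- stevedore-uploader only runs on JRuby
gem 'stevedore-uploader', '1.0.4', platform: :jruby
```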