stevedore-uploader 1.0.11-java → 1.0.12-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 61135f66cc8381bffa4e4677b0d4a4902f79044d
4
- data.tar.gz: d1e84349cd454007b163239d96c1b8fc5002bf8f
3
+ metadata.gz: f11e36b9388530683b93423de8d0fabf3997d46b
4
+ data.tar.gz: 68f2bbe57091826e63a0a4de0a39afff43c70a7a
5
5
  SHA512:
6
- metadata.gz: 2a0830f5a9ba44af16c2ddcbdebfca91860e2a8ccb70c33cc42f8af7ae560c7128c01b66db7daecaa6245e50b2c6e0ac555482f9fcf2638ed55a33126eeca87f
7
- data.tar.gz: 0f8869e3ff4beac889dacb7f049893a3b0094ef9fee0d411c2f60beae7367bf877b337a64f5e95d319c135b6db5c2f04c63f3e83ddbab585636335a2baaeca5e
6
+ metadata.gz: 57e658b3a6ee1147563861165e96ba9346befd3f72b986ace6c4c2cdf5c590e267faa889f8ab72beea8a3aabb8a0e08093001990df39fae4d436d9b18d9622cc
7
+ data.tar.gz: 298762fd18cd429f3057eb3684510c518b11a6379a2fe692c31791e41fc9c1f08c13f8f9aea21486d18546e584f6f314c50b4180d7d00af31b7d3250da18e1e1
@@ -19,9 +19,9 @@ module Stevedore
19
19
  t.message_from = metadata["Message-From"]
20
20
  t.message_cc = metadata["Message-Cc"]
21
21
  t.title = t.subject = metadata["subject"]
22
- t.dkim_verified = begin
23
- Dkim::Verifier.new(filepath).verify!
24
- rescue Dkim::DkimError
22
+ t.dkim_verified = filepath.end_with?("eml") && begin
23
+ DkimVerify::Verification::Verifier.new(open(filepath, 'r'){|f| f.read }).verify!
24
+ rescue DkimVerify::Verification::DkimError
25
25
  false
26
26
  end
27
27
  t.attachments = metadata["X-Attachments"].to_s.split("|").map do |raw_attachment_filename|
@@ -117,6 +117,31 @@ module Stevedore
117
117
  resp
118
118
  end
119
119
 
120
+ def self.ocr_pdf(filename)
121
+ pdf_basename = filename.gsub(".pdf", '')
122
+ ocred_pdf_filename = "#{pdf_basename}.ocr.pdf"
123
+
124
+ ret = system("convert","-monochrome","-density","300x300",filename,"-depth",'8',"#{pdf_basename}.png")
125
+ if ret.nil?
126
+ STDERR.puts "No imagemagick or ghostscript (or not on path); skipping OCR"
127
+ return nil
128
+ end
129
+ (Dir["#{pdf_basename}-*.png"] + Dir["#{pdf_basename}.png"]).sort_by{|png| (matchdata = png.match(/-\d+\.png/)).nil? ? 0 : matchdata[0].to_i }.each do |png|
130
+ ret = system('tesseract', png, png, "pdf", "")
131
+ if ret.nil?
132
+ STDERR.puts "No tesseract (or not on path); skipping OCR"
133
+ return nil
134
+ end
135
+ File.delete(png)
136
+ File.delete("#{png}.txt") rescue nil
137
+ end.join("\n\n")
138
+ # e.g. Analysis-Corporation-2.png.pdf or Torture.pdf
139
+ files = Dir["#{pdf_basename}.png.pdf"] + (Dir["#{pdf_basename}-*.png.pdf"].sort_by{|pdf| (m = Regexp.new("#{pdf_basename}-([0-9]+).png.pdf").match(pdf)) ? m[1].to_i : 69420 })
140
+ return nil if files.empty?
141
+ system('pdftk', *files, "cat", "output", ocred_pdf_filename)
142
+ ocred_pdf_filename
143
+ end
144
+
120
145
  def process_document(filename, download_url)
121
146
  begin
122
147
  puts "begin to process #{filename}"
@@ -128,12 +153,11 @@ module Stevedore
128
153
  metadata = "couldn't be parsed"
129
154
  end
130
155
  puts "parsed: #{content.size}"
131
- if content.size > 3 * (10 ** 7)
156
+ if content.size > 3 * (10 ** 7) # 12071712
132
157
  @errors << filename
133
158
  puts "skipping #{filename} for being too big"
134
159
  return nil
135
160
  end
136
- puts metadata["Content-Type"].inspect
137
161
 
138
162
  # TODO: factor these out in favor of the yield/block situation down below.
139
163
  # this should (eventually) be totally generic, but perhaps handle common
@@ -146,23 +170,14 @@ module Stevedore
146
170
  when @should_ocr && metadata["Content-Type"] == "application/pdf" && (content.match(/\A\s*\z/) || content.size < 50 * metadata["xmpTPg:NPages"].to_i )
147
171
  # this is a scanned PDF.
148
172
  puts "scanned PDF #{File.basename(filename)} detected; OCRing"
149
- pdf_basename = filename.gsub(".pdf", '')
150
- system("convert","-monochrome","-density","300x300",filename,"-depth",'8',"#{pdf_basename}.png")
151
- (Dir["#{pdf_basename}-*.png"] + Dir["#{pdf_basename}.png"]).sort_by{|png| (matchdata = png.match(/-\d+\.png/)).nil? ? 0 : matchdata[0].to_i }.each do |png|
152
- system('tesseract', png, png, "pdf")
153
- File.delete(png)
154
- # no need to use a system call when we could use the stdlib!
155
- # system("rm", "-f", png) rescue nil
156
- File.delete("#{png}.txt") rescue nil
157
- end.join("\n\n")
158
- # e.g. Analysis-Corporation-2.png.pdf or Torture.pdf
159
- files = Dir["#{pdf_basename}.png.pdf"] + (Dir["#{pdf_basename}-*.png.pdf"].sort_by{|pdf| (m = Regexp.new("#{pdf_basename}-([0-9]+).png.pdf").match(pdf)) ? m[1].to_i : 69420 }) # 69420 is a random really big number, sorting those docs to the end.
160
- if files.empty?
161
- content = ''
173
+
174
+ ocred_pdf_filename = ESUploader.ocr_pdf(filename)
175
+ if ocred_pdf_filename
176
+ content, _ = Rika.parse_content_and_metadata(ocred_pdf_filename)
162
177
  else
163
- system('pdftk', *files, "cat", "output", "#{pdf_basename}.ocr.pdf")
164
- content, _ = Rika.parse_content_and_metadata("#{pdf_basename}.ocr.pdf")
178
+ content = ''
165
179
  end
180
+
166
181
  puts "OCRed content (#{File.basename(filename)}) length: #{content.length}"
167
182
  ::Stevedore::StevedoreBlob.new_from_tika(content, metadata, download_url, filename).to_hash
168
183
  else
@@ -206,6 +221,30 @@ module Stevedore
206
221
  end
207
222
  end
208
223
 
224
+ def download_from_s3(dir, obj)
225
+ tmp_filename = File.join(dir, obj.key)
226
+ retried = false
227
+ begin
228
+ body = obj.get.body.read
229
+ File.open(tmp_filename, 'wb'){|f| f << body}
230
+ rescue Aws::S3::Errors::NoSuchKey, Aws::S3::Errors::Http503Error
231
+ @errors << obj.key
232
+ rescue ArgumentError
233
+ File.open(tmp_filename, 'wb'){|f| f << body.nil? ? '' : body.chars.select(&:valid_encoding?).join}
234
+ rescue Seahorse::Client::NetworkingError
235
+ unless retried
236
+ retried = true
237
+ retry
238
+ else
239
+ puts "looks like the network's down... press any key to continue once the network's back up"
240
+ gets
241
+ retry
242
+ end
243
+ end
244
+ tmp_filename
245
+ end
246
+
247
+
209
248
  def do!(target_path, output_stream=STDOUT)
210
249
  output_stream.puts "Processing documents from #{target_path}"
211
250
 
@@ -227,15 +266,7 @@ module Stevedore
227
266
  slice_of_objs.map! do |obj|
228
267
  next if obj.key[-1] == "/"
229
268
  FileUtils.mkdir_p(File.join(dir, File.dirname(obj.key)))
230
- tmp_filename = File.join(dir, obj.key)
231
- begin
232
- body = obj.get.body.read
233
- File.open(tmp_filename, 'wb'){|f| f << body}
234
- rescue Aws::S3::Errors::NoSuchKey
235
- @errors << obj.key
236
- rescue ArgumentError
237
- File.open(tmp_filename, 'wb'){|f| f << body.nil? ? '' : body.chars.select(&:valid_encoding?).join}
238
- end
269
+ tmp_filename = download_from_s3(dir, obj)
239
270
  download_filename = "https://#{@s3_bucket}.s3.amazonaws.com/" + obj.key
240
271
 
241
272
  # is this file an archive that contains a bunch of documents we should index separately?
@@ -274,11 +305,11 @@ module Stevedore
274
305
  puts resp.inspect if resp && resp["errors"]
275
306
  rescue Manticore::Timeout, Manticore::SocketException
276
307
  output_stream.puts("retrying at #{Time.now}")
277
- if retry_count < 10
308
+ if retry_count < 3
278
309
  retry_count += 1
279
310
  retry
280
311
  else
281
- @errors << filename
312
+ @errors += slice_of_objs.map &:key
282
313
  end
283
314
  end
284
315
  output_stream.puts "uploaded #{slice_of_objs.size} files to #{@es_index}; #{docs_so_far} uploaded so far"
@@ -322,7 +353,6 @@ module Stevedore
322
353
 
323
354
  yield doc, filename, content, metadata if block_given?
324
355
  # FileUtils.rm(constituent_file) rescue Errno::ENOENT # try to delete, but no biggie if it doesn't work for some weird reason.
325
- puts doc.inspect
326
356
  doc
327
357
  end
328
358
  else
@@ -340,11 +370,12 @@ module Stevedore
340
370
  output_stream.puts "Upload error: #{e} #{e.message}."
341
371
  output_stream.puts e.backtrace.join("\n") + "\n\n\n"
342
372
  output_stream.puts("retrying at #{Time.now}")
343
- if retry_count < 10
344
- retry_count += 1
373
+ if retry_count < 3
374
+ retry_count += 1
375
+ sleep 30
345
376
  retry
346
377
  else
347
- @errors << filename
378
+ @errors << slice_of_files
348
379
  end
349
380
  end
350
381
  output_stream.puts "uploaded #{slice_of_files.size} files to #{@es_index}; #{docs_so_far} uploaded so far"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: stevedore-uploader
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.11
4
+ version: 1.0.12
5
5
  platform: java
6
6
  authors:
7
7
  - Jeremy B. Merrill
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-05-04 00:00:00.000000000 Z
11
+ date: 2017-05-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement