stevedore-uploader 1.0.11-java → 1.0.12-java

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 61135f66cc8381bffa4e4677b0d4a4902f79044d
4
- data.tar.gz: d1e84349cd454007b163239d96c1b8fc5002bf8f
3
+ metadata.gz: f11e36b9388530683b93423de8d0fabf3997d46b
4
+ data.tar.gz: 68f2bbe57091826e63a0a4de0a39afff43c70a7a
5
5
  SHA512:
6
- metadata.gz: 2a0830f5a9ba44af16c2ddcbdebfca91860e2a8ccb70c33cc42f8af7ae560c7128c01b66db7daecaa6245e50b2c6e0ac555482f9fcf2638ed55a33126eeca87f
7
- data.tar.gz: 0f8869e3ff4beac889dacb7f049893a3b0094ef9fee0d411c2f60beae7367bf877b337a64f5e95d319c135b6db5c2f04c63f3e83ddbab585636335a2baaeca5e
6
+ metadata.gz: 57e658b3a6ee1147563861165e96ba9346befd3f72b986ace6c4c2cdf5c590e267faa889f8ab72beea8a3aabb8a0e08093001990df39fae4d436d9b18d9622cc
7
+ data.tar.gz: 298762fd18cd429f3057eb3684510c518b11a6379a2fe692c31791e41fc9c1f08c13f8f9aea21486d18546e584f6f314c50b4180d7d00af31b7d3250da18e1e1
@@ -19,9 +19,9 @@ module Stevedore
19
19
  t.message_from = metadata["Message-From"]
20
20
  t.message_cc = metadata["Message-Cc"]
21
21
  t.title = t.subject = metadata["subject"]
22
- t.dkim_verified = begin
23
- Dkim::Verifier.new(filepath).verify!
24
- rescue Dkim::DkimError
22
+ t.dkim_verified = filepath.end_with?("eml") && begin
23
+ DkimVerify::Verification::Verifier.new(open(filepath, 'r'){|f| f.read }).verify!
24
+ rescue DkimVerify::Verification::DkimError
25
25
  false
26
26
  end
27
27
  t.attachments = metadata["X-Attachments"].to_s.split("|").map do |raw_attachment_filename|
@@ -117,6 +117,31 @@ module Stevedore
117
117
  resp
118
118
  end
119
119
 
120
# OCRs a PDF by shelling out to external tools and returns the path of the
# resulting PDF.
#
# Steps (each one a Kernel#system call, so no shell interpolation happens):
#   1. `convert` rasterizes +filename+ into "<basename>.png" and/or
#      "<basename>-N.png" (both spellings are globbed for below).
#   2. `tesseract` is run on every PNG with the "pdf" config; the PNG and any
#      "<png>.txt" side file are deleted afterwards.
#   3. `pdftk` concatenates the per-page "*.png.pdf" files, in numeric page
#      order, into "<basename>.ocr.pdf".
#
# @param filename [String] path of the PDF to OCR
# @return [String, nil] path of "<basename>.ocr.pdf"; nil when a required
#   tool cannot be run, when no page PDFs were produced, or when the merge
#   did not succeed
def self.ocr_pdf(filename)
  # Strip only the trailing extension; a ".pdf" elsewhere in the path
  # (e.g. "a.pdf.scans/b.pdf") must be left alone.
  pdf_basename = filename.sub(/\.pdf\z/, '')
  ocred_pdf_filename = "#{pdf_basename}.ocr.pdf"

  # Kernel#system returns nil only when the command could not be executed.
  converted = system("convert", "-monochrome", "-density", "300x300", filename, "-depth", '8', "#{pdf_basename}.png")
  if converted.nil?
    STDERR.puts "No imagemagick or ghostscript (or not on path); skipping OCR"
    return nil
  end

  page_images = Dir["#{pdf_basename}-*.png"] + Dir["#{pdf_basename}.png"]
  # Sort by the captured page number (an unnumbered PNG counts as page 0).
  page_images.sort_by { |png| png[/-(\d+)\.png\z/, 1].to_i }.each do |png|
    # NOTE(review): the trailing empty-string argument is kept exactly as it
    # was; its purpose is not evident from this file -- confirm before removing.
    if system('tesseract', png, png, "pdf", "").nil?
      STDERR.puts "No tesseract (or not on path); skipping OCR"
      return nil
    end
    File.delete(png)
    txt_side_file = "#{png}.txt"
    File.delete(txt_side_file) if File.exist?(txt_side_file)
  end

  # e.g. Analysis-Corporation-2.png.pdf or Torture.pdf
  # The basename is escaped so regexp metacharacters in a file name
  # ("report(1)", "a+b") cannot break the page-number match.
  page_pattern = /\A#{Regexp.escape(pdf_basename)}-([0-9]+)\.png\.pdf\z/
  numbered_pages = Dir["#{pdf_basename}-*.png.pdf"].sort_by do |pdf|
    # Anything that does not look like a numbered page sorts to the end.
    (m = page_pattern.match(pdf)) ? m[1].to_i : Float::INFINITY
  end
  files = Dir["#{pdf_basename}.png.pdf"] + numbered_pages
  return nil if files.empty?

  merged = system('pdftk', *files, "cat", "output", ocred_pdf_filename)
  STDERR.puts "No pdftk (or not on path); skipping OCR" if merged.nil?
  # Do not hand back the path of a file that was never written.
  return nil unless merged

  ocred_pdf_filename
end
144
+
120
145
  def process_document(filename, download_url)
121
146
  begin
122
147
  puts "begin to process #{filename}"
@@ -128,12 +153,11 @@ module Stevedore
128
153
  metadata = "couldn't be parsed"
129
154
  end
130
155
  puts "parsed: #{content.size}"
131
- if content.size > 3 * (10 ** 7)
156
+ if content.size > 3 * (10 ** 7) # 12071712
132
157
  @errors << filename
133
158
  puts "skipping #{filename} for being too big"
134
159
  return nil
135
160
  end
136
- puts metadata["Content-Type"].inspect
137
161
 
138
162
  # TODO: factor these out in favor of the yield/block situation down below.
139
163
  # this should (eventually) be totally generic, but perhaps handle common
@@ -146,23 +170,14 @@ module Stevedore
146
170
  when @should_ocr && metadata["Content-Type"] == "application/pdf" && (content.match(/\A\s*\z/) || content.size < 50 * metadata["xmpTPg:NPages"].to_i )
147
171
  # this is a scanned PDF.
148
172
  puts "scanned PDF #{File.basename(filename)} detected; OCRing"
149
- pdf_basename = filename.gsub(".pdf", '')
150
- system("convert","-monochrome","-density","300x300",filename,"-depth",'8',"#{pdf_basename}.png")
151
- (Dir["#{pdf_basename}-*.png"] + Dir["#{pdf_basename}.png"]).sort_by{|png| (matchdata = png.match(/-\d+\.png/)).nil? ? 0 : matchdata[0].to_i }.each do |png|
152
- system('tesseract', png, png, "pdf")
153
- File.delete(png)
154
- # no need to use a system call when we could use the stdlib!
155
- # system("rm", "-f", png) rescue nil
156
- File.delete("#{png}.txt") rescue nil
157
- end.join("\n\n")
158
- # e.g. Analysis-Corporation-2.png.pdf or Torture.pdf
159
- files = Dir["#{pdf_basename}.png.pdf"] + (Dir["#{pdf_basename}-*.png.pdf"].sort_by{|pdf| (m = Regexp.new("#{pdf_basename}-([0-9]+).png.pdf").match(pdf)) ? m[1].to_i : 69420 }) # 69420 is a random really big number, sorting those docs to the end.
160
- if files.empty?
161
- content = ''
173
+
174
+ ocred_pdf_filename = ESUploader.ocr_pdf(filename)
175
+ if ocred_pdf_filename
176
+ content, _ = Rika.parse_content_and_metadata(ocred_pdf_filename)
162
177
  else
163
- system('pdftk', *files, "cat", "output", "#{pdf_basename}.ocr.pdf")
164
- content, _ = Rika.parse_content_and_metadata("#{pdf_basename}.ocr.pdf")
178
+ content = ''
165
179
  end
180
+
166
181
  puts "OCRed content (#{File.basename(filename)}) length: #{content.length}"
167
182
  ::Stevedore::StevedoreBlob.new_from_tika(content, metadata, download_url, filename).to_hash
168
183
  else
@@ -206,6 +221,30 @@ module Stevedore
206
221
  end
207
222
  end
208
223
 
224
# Downloads one S3 object to a local file under +dir+ and returns the local
# path. The path is File.join(dir, obj.key); this method does not create
# parent directories.
#
# Failure handling:
#   * missing key / HTTP 503: obj.key is appended to @errors, nothing is written
#   * ArgumentError: whatever was read is rewritten with invalidly-encoded
#     characters dropped (an empty file if nothing was read)
#   * networking error: retried once automatically; after that the user is
#     asked to press a key before each further retry
#
# @param dir [String] local directory to download into
# @param obj [#key, #get] S3 object; `obj.get.body.read` must return its bytes
# @return [String] the local path, returned even when the download failed
#   (callers should consult @errors)
def download_from_s3(dir, obj)
  tmp_filename = File.join(dir, obj.key)
  retried = false
  begin
    body = nil # reset on every attempt so a retry never sees a stale body
    body = obj.get.body.read
    File.open(tmp_filename, 'wb') { |f| f << body }
  rescue Aws::S3::Errors::NoSuchKey, Aws::S3::Errors::Http503Error
    @errors << obj.key
  rescue ArgumentError
    # Compute the fallback content first: `f << body.nil? ? a : b` parses as
    # `(f << body.nil?) ? a : b` and would write "true"/"false" to the file.
    cleaned = body.nil? ? '' : body.chars.select(&:valid_encoding?).join
    File.open(tmp_filename, 'wb') { |f| f << cleaned }
  rescue Seahorse::Client::NetworkingError
    unless retried
      retried = true
      retry
    end
    puts "looks like the network's down... press any key to continue once the network's back up"
    # Read from STDIN explicitly: bare `gets` reads ARGF, i.e. the files named
    # in ARGV when the program was started with arguments.
    if STDIN.gets.nil?
      # STDIN is at EOF (non-interactive run): give up instead of spinning.
      @errors << obj.key
    else
      retry
    end
  end
  tmp_filename
end
246
+
247
+
209
248
  def do!(target_path, output_stream=STDOUT)
210
249
  output_stream.puts "Processing documents from #{target_path}"
211
250
 
@@ -227,15 +266,7 @@ module Stevedore
227
266
  slice_of_objs.map! do |obj|
228
267
  next if obj.key[-1] == "/"
229
268
  FileUtils.mkdir_p(File.join(dir, File.dirname(obj.key)))
230
- tmp_filename = File.join(dir, obj.key)
231
- begin
232
- body = obj.get.body.read
233
- File.open(tmp_filename, 'wb'){|f| f << body}
234
- rescue Aws::S3::Errors::NoSuchKey
235
- @errors << obj.key
236
- rescue ArgumentError
237
- File.open(tmp_filename, 'wb'){|f| f << body.nil? ? '' : body.chars.select(&:valid_encoding?).join}
238
- end
269
+ tmp_filename = download_from_s3(dir, obj)
239
270
  download_filename = "https://#{@s3_bucket}.s3.amazonaws.com/" + obj.key
240
271
 
241
272
  # is this file an archive that contains a bunch of documents we should index separately?
@@ -274,11 +305,11 @@ module Stevedore
274
305
  puts resp.inspect if resp && resp["errors"]
275
306
  rescue Manticore::Timeout, Manticore::SocketException
276
307
  output_stream.puts("retrying at #{Time.now}")
277
- if retry_count < 10
308
+ if retry_count < 3
278
309
  retry_count += 1
279
310
  retry
280
311
  else
281
- @errors << filename
312
+ @errors += slice_of_objs.map &:key
282
313
  end
283
314
  end
284
315
  output_stream.puts "uploaded #{slice_of_objs.size} files to #{@es_index}; #{docs_so_far} uploaded so far"
@@ -322,7 +353,6 @@ module Stevedore
322
353
 
323
354
  yield doc, filename, content, metadata if block_given?
324
355
  # FileUtils.rm(constituent_file) rescue Errno::ENOENT # try to delete, but no biggie if it doesn't work for some weird reason.
325
- puts doc.inspect
326
356
  doc
327
357
  end
328
358
  else
@@ -340,11 +370,12 @@ module Stevedore
340
370
  output_stream.puts "Upload error: #{e} #{e.message}."
341
371
  output_stream.puts e.backtrace.join("\n") + "\n\n\n"
342
372
  output_stream.puts("retrying at #{Time.now}")
343
- if retry_count < 10
344
- retry_count += 1
373
+ if retry_count < 3
374
+ retry_count += 1
375
+ sleep 30
345
376
  retry
346
377
  else
347
- @errors << filename
378
+ @errors << slice_of_files
348
379
  end
349
380
  end
350
381
  output_stream.puts "uploaded #{slice_of_files.size} files to #{@es_index}; #{docs_so_far} uploaded so far"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: stevedore-uploader
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.11
4
+ version: 1.0.12
5
5
  platform: java
6
6
  authors:
7
7
  - Jeremy B. Merrill
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-05-04 00:00:00.000000000 Z
11
+ date: 2017-05-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement