stevedore-uploader 1.0.7-java → 1.0.9-java

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 706101e4322d8e5bfd110b035115edf7bc02f971
4
- data.tar.gz: 468209df9258209c9d86b28957a9e1ba71d13ffb
3
+ metadata.gz: d46b0968e5b625a0364cd9e165930b9f0ea0849b
4
+ data.tar.gz: 1f42e1aac685f31206f33fc6c7a68f06a94e8ab3
5
5
  SHA512:
6
- metadata.gz: 2440f6a82e52a4a7746b347c2fdbac6fbaac429cc2ad2a8267b75919a61b15bbfa78b90c94188fcb67e0e322b4ea8d26be370e76827ae9c6c9d64f507a5cc7af
7
- data.tar.gz: ab8ce1c340c94796a515c80f9cd66514daa5a415bdae2fd9efebe4f55a9b9932e302b963595bb7e108832bea3d379ac8d8eb4b98fd17dbe7aff887d1038f6fdf
6
+ metadata.gz: aa15c8bf07c5f1410396892e54b369dff1b5df14cf3521e02c7da449db84373f24c508bff3a9a28944741f8b4fb760505802b4975f62fb5aadffa516a29b6a5d
7
+ data.tar.gz: dd4c87fc77d0156dfca7324182d1f741e1babdf8320d499209d017b8b9061349f19220d5804b725225d9c794cbd2cde22bfb617a0d054fc025691368a69816b6
data/README.md CHANGED
@@ -74,4 +74,4 @@ end
74
74
  Questions?
75
75
  ==========
76
76
 
77
- Hit us up in the [Stevedore](https://github.com/newsdev/stevedore) issues. Whichever suits your fancy.
77
+ Hit us up in the [Stevedore](https://github.com/newsdev/stevedore) issues.
@@ -17,7 +17,7 @@ module Stevedore
17
17
  end
18
18
 
19
19
  def self.new_from_tika(content, metadata, download_url, filename)
20
- self.new(metadata["title"] || File.basename(filename), content, download_url)
20
+ self.new( ((metadata["title"] && metadata["title"] != "Untitled") ? metadata["title"] : File.basename(filename)), content, download_url)
21
21
  end
22
22
 
23
23
  def analyze!
@@ -26,12 +26,16 @@ module Stevedore
26
26
  end
27
27
 
28
28
  def to_hash
29
+ sha = Digest::SHA1.hexdigest(download_url)
30
+ # TODO should merge in or something?
29
31
  {
30
- "sha1" => Digest::SHA1.hexdigest(download_url),
31
- "title" => title.to_s,
32
+ "sha1" => sha,
33
+ "id" => sha,
34
+ "_id" => sha,
35
+ "title" => title.to_s || "Untitled Document: #{HumanHash::HumanHasher.new.humanize(sha)}",
32
36
  "source_url" => download_url.to_s,
33
37
  "file" => {
34
- "title" => title.to_s,
38
+ "title" => title.to_s || "Untitled Document: #{HumanHash::HumanHasher.new.humanize(sha)}",
35
39
  "file" => clean_text.to_s
36
40
  },
37
41
  "analyzed" => {
@@ -40,7 +44,7 @@ module Stevedore
40
44
  "Content-Type" => extra["Content-Type"] || "text/plain"
41
45
  }
42
46
  },
43
- "_updatedAt" => Time.now
47
+ "_updatedAt" => Time.now,
44
48
  }
45
49
  end
46
50
 
@@ -1,7 +1,7 @@
1
1
  require 'digest/sha1'
2
2
 
3
3
  module Stevedore
4
- class StevedoreCsvRow
4
+ class StevedoreCsvRow < StevedoreBlob
5
5
  attr_accessor :title, :text, :download_url, :whole_row, :row_num
6
6
  def initialize(title, text, row_num, download_url, whole_row={})
7
7
  self.title = title || download_url
@@ -2,12 +2,15 @@ require_relative './stevedore_blob'
2
2
  require 'cgi'
3
3
  require 'digest/sha1'
4
4
  require 'manticore'
5
+ require 'dkimverify'
6
+
7
+
5
8
  module Stevedore
6
9
  class StevedoreEmail < StevedoreBlob
7
10
 
8
11
 
9
12
  # TODO write wrt other fields. where do those go???
10
- attr_accessor :creation_date, :message_to, :message_from, :message_cc, :subject, :attachments, :content_type
13
+ attr_accessor :creation_date, :message_to, :message_from, :message_cc, :subject, :attachments, :content_type, :dkim_verified
11
14
 
12
15
  def self.new_from_tika(content, metadata, download_url, filepath)
13
16
  t = super
@@ -16,6 +19,11 @@ module Stevedore
16
19
  t.message_from = metadata["Message-From"]
17
20
  t.message_cc = metadata["Message-Cc"]
18
21
  t.title = t.subject = metadata["subject"]
22
+ t.dkim_verified = begin
23
+ Dkim::Verifier.new(filepath).verify!
24
+ rescue Dkim::DkimError
25
+ false
26
+ end
19
27
  t.attachments = metadata["X-Attachments"].to_s.split("|").map do |raw_attachment_filename|
20
28
  attachment_filename = CGI::unescape(raw_attachment_filename)
21
29
  possible_filename = File.join(File.dirname(filepath), attachment_filename)
@@ -72,7 +80,8 @@ module Stevedore
72
80
  "Message-From" => message_to.is_a?(Enumerable) ? message_to : [ message_to ],
73
81
  "Message-Cc" => message_cc.is_a?(Enumerable) ? message_cc : [ message_cc ],
74
82
  "subject" => subject,
75
- "attachments" => attachments
83
+ "attachments" => attachments,
84
+ "dkim_verified" => dkim_verified
76
85
  }
77
86
  },
78
87
  "_updatedAt" => Time.now
@@ -138,7 +138,7 @@ module Stevedore
138
138
  # TODO: factor these out in favor of the yield/block situation down below.
139
139
  # this should (eventually) be totally generic, but perhaps handle common
140
140
  # document types on its own
141
- ret = case # .eml # .msg
141
+ doc = case # .eml # .msg
142
142
  when metadata["Content-Type"] == "message/rfc822" || metadata["Content-Type"] == "application/vnd.ms-outlook"
143
143
  ::Stevedore::StevedoreEmail.new_from_tika(content, metadata, download_url, filename).to_hash
144
144
  when metadata["Content-Type"] && ["application/html", "application/xhtml+xml"].include?(metadata["Content-Type"].split(";").first)
@@ -156,16 +156,19 @@ module Stevedore
156
156
  File.delete("#{png}.txt") rescue nil
157
157
  end.join("\n\n")
158
158
  # e.g. Analysis-Corporation-2.png.pdf or Torture.pdf
159
- files = Dir["#{pdf_basename}.png.pdf"] + (Dir["#{pdf_basename}-*.png.pdf"].sort_by{|pdf| Regexp.new("#{pdf_basename}-([0-9]+).png.pdf").match(pdf)[1].to_i })
160
- return nil if files.empty?
161
- system('pdftk', *files, "cat", "output", "#{pdf_basename}.ocr.pdf")
162
- content, _ = Rika.parse_content_and_metadata("#{pdf_basename}.ocr.pdf")
159
+ files = Dir["#{pdf_basename}.png.pdf"] + (Dir["#{pdf_basename}-*.png.pdf"].sort_by{|pdf| (m = Regexp.new("#{pdf_basename}-([0-9]+).png.pdf").match(pdf)) ? m[1].to_i : 69420 }) # 69420 is a random really big number, sorting those docs to the end.
160
+ if files.empty?
161
+ content = ''
162
+ else
163
+ system('pdftk', *files, "cat", "output", "#{pdf_basename}.ocr.pdf")
164
+ content, _ = Rika.parse_content_and_metadata("#{pdf_basename}.ocr.pdf")
165
+ end
163
166
  puts "OCRed content (#{File.basename(filename)}) length: #{content.length}"
164
167
  ::Stevedore::StevedoreBlob.new_from_tika(content, metadata, download_url, filename).to_hash
165
168
  else
166
169
  ::Stevedore::StevedoreBlob.new_from_tika(content, metadata, download_url, filename).to_hash
167
170
  end
168
- [ret, content, metadata]
171
+ [doc, content, metadata]
169
172
  rescue StandardError, java.lang.NoClassDefFoundError, org.apache.tika.exception.TikaException => e
170
173
  STDERR.puts e.inspect
171
174
  STDERR.puts "#{e} #{e.message}: #{filename}"
@@ -220,7 +223,6 @@ module Stevedore
220
223
  s3_path_without_bucket = target_path.gsub(/s3:\/\//i, '').split("/", 2).last
221
224
  bucket.objects(:prefix => s3_path_without_bucket).each_slice(@slice_size) do |slice_of_objs|
222
225
  docs_so_far += slice_of_objs.size
223
-
224
226
  output_stream.puts "starting a set of #{@slice_size} -- so far #{docs_so_far}"
225
227
  slice_of_objs.map! do |obj|
226
228
  next if obj.key[-1] == "/"
@@ -244,12 +246,19 @@ module Stevedore
244
246
  if ArchiveSplitter::HANDLED_FORMATS.include?(tmp_filename.split(".")[-1])
245
247
  ArchiveSplitter.split(tmp_filename).map do |constituent_file, constituent_basename, attachment_basenames, parent_basename|
246
248
  doc, content, metadata = process_document(constituent_file, download_filename)
249
+ next nil if doc.nil?
247
250
  doc["analyzed"] ||= {}
248
251
  doc["analyzed"]["metadata"] ||= {}
252
+
253
+ # this is a hack: but we're replicating how IDs are calculated (in parsers/stevedore_blob.rb) to make "attachments" the list of IDs of all documents in the archive
254
+ # we have to set separate sha1s for these, because they're by default based only on the download URL (which is the same for all of the constituent files)
249
255
  doc["analyzed"]["metadata"]["attachments"] = (parent_basename.nil? ? [] : [Digest::SHA1.hexdigest(download_filename + parent_basename)]) + attachment_basenames.map{|attachment| Digest::SHA1.hexdigest(download_filename + attachment) } # is a list of filenames
250
256
  doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive, we need to come up with a custom sha1)
257
+ doc["id"] = doc["sha1"]
258
+ doc["_id"] = doc["sha1"]
251
259
  yield doc, obj.key, content, metadata if block_given?
252
260
  FileUtils.rm(constituent_file) rescue Errno::ENOENT # try to delete, but no biggie if it doesn't work for some weird reason.
261
+ doc["file"]["title"] ||= "Untitled Document: #{HumanHash::HumanHasher.new.humanize(doc["_id"])}"
253
262
  doc
254
263
  end
255
264
  else
@@ -300,14 +309,20 @@ module Stevedore
300
309
  if ArchiveSplitter::HANDLED_FORMATS.include?(filename.split(".")[-1])
301
310
  ArchiveSplitter.split(filename).map do |constituent_file, constituent_basename, attachment_basenames, parent_basename|
302
311
  doc, content, metadata = process_document(constituent_file, download_filename)
303
- doc = {} if doc.nil?
312
+ next nil if doc.nil?
304
313
  doc["analyzed"] ||= {}
305
314
  doc["analyzed"]["metadata"] ||= {}
315
+
316
+ # this is a hack: but we're replicating how IDs are calculated (in parsers/stevedore_blob.rb) to make "attachments" the list of IDs of all documents in the archive
317
+ # we have to set separate sha1s for these, because they're by default based only on the download URL (which is the same for all of the constituent files)
306
318
  doc["analyzed"]["metadata"]["attachments"] = (parent_basename.nil? ? [] : [Digest::SHA1.hexdigest(download_filename + parent_basename)]) + attachment_basenames.map{|attachment| Digest::SHA1.hexdigest(download_filename + attachment) } # is a list of filenames
307
319
  doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive, we need to come up with a custom sha1)
308
320
  doc["id"] = doc["sha1"]
321
+ doc["_id"] = doc["sha1"]
322
+
309
323
  yield doc, filename, content, metadata if block_given?
310
324
  # FileUtils.rm(constituent_file) rescue Errno::ENOENT # try to delete, but no biggie if it doesn't work for some weird reason.
325
+ puts doc.inspect
311
326
  doc
312
327
  end
313
328
  else
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: stevedore-uploader
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.7
4
+ version: 1.0.9
5
5
  platform: java
6
6
  authors:
7
7
  - Jeremy B. Merrill
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-11-14 00:00:00.000000000 Z
11
+ date: 2017-02-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -71,7 +71,7 @@ dependencies:
71
71
  requirements:
72
72
  - - "~>"
73
73
  - !ruby/object:Gem::Version
74
- version: 1.7.1
74
+ version: '1.7'
75
75
  name: rika-stevedore
76
76
  prerelease: false
77
77
  type: :runtime
@@ -79,7 +79,7 @@ dependencies:
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: 1.7.1
82
+ version: '1.7'
83
83
  - !ruby/object:Gem::Dependency
84
84
  requirement: !ruby/object:Gem::Requirement
85
85
  requirements:
@@ -136,6 +136,34 @@ dependencies:
136
136
  - - "~>"
137
137
  - !ruby/object:Gem::Version
138
138
  version: '1.1'
139
+ - !ruby/object:Gem::Dependency
140
+ requirement: !ruby/object:Gem::Requirement
141
+ requirements:
142
+ - - "~>"
143
+ - !ruby/object:Gem::Version
144
+ version: 0.1.0
145
+ name: humanhash
146
+ prerelease: false
147
+ type: :runtime
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: 0.1.0
153
+ - !ruby/object:Gem::Dependency
154
+ requirement: !ruby/object:Gem::Requirement
155
+ requirements:
156
+ - - "~>"
157
+ - !ruby/object:Gem::Version
158
+ version: 0.0.4
159
+ name: dkimverify
160
+ prerelease: false
161
+ type: :runtime
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - "~>"
165
+ - !ruby/object:Gem::Version
166
+ version: 0.0.4
139
167
  description: TK
140
168
  email: jeremy.merrill@nytimes.com
141
169
  executables: []
@@ -170,7 +198,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
170
198
  version: '0'
171
199
  requirements: []
172
200
  rubyforge_project:
173
- rubygems_version: 2.4.8
201
+ rubygems_version: 2.6.6
174
202
  signing_key:
175
203
  specification_version: 4
176
204
  summary: Upload documents to a Stevedore search engine.