stevedore-uploader 1.0.7-java → 1.0.9-java

This diff shows the changes between two publicly released versions of a package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 706101e4322d8e5bfd110b035115edf7bc02f971
4
- data.tar.gz: 468209df9258209c9d86b28957a9e1ba71d13ffb
3
+ metadata.gz: d46b0968e5b625a0364cd9e165930b9f0ea0849b
4
+ data.tar.gz: 1f42e1aac685f31206f33fc6c7a68f06a94e8ab3
5
5
  SHA512:
6
- metadata.gz: 2440f6a82e52a4a7746b347c2fdbac6fbaac429cc2ad2a8267b75919a61b15bbfa78b90c94188fcb67e0e322b4ea8d26be370e76827ae9c6c9d64f507a5cc7af
7
- data.tar.gz: ab8ce1c340c94796a515c80f9cd66514daa5a415bdae2fd9efebe4f55a9b9932e302b963595bb7e108832bea3d379ac8d8eb4b98fd17dbe7aff887d1038f6fdf
6
+ metadata.gz: aa15c8bf07c5f1410396892e54b369dff1b5df14cf3521e02c7da449db84373f24c508bff3a9a28944741f8b4fb760505802b4975f62fb5aadffa516a29b6a5d
7
+ data.tar.gz: dd4c87fc77d0156dfca7324182d1f741e1babdf8320d499209d017b8b9061349f19220d5804b725225d9c794cbd2cde22bfb617a0d054fc025691368a69816b6
data/README.md CHANGED
@@ -74,4 +74,4 @@ end
74
74
  Questions?
75
75
  ==========
76
76
 
77
- Hit us up in the [Stevedore](https://github.com/newsdev/stevedore) issues. Whichever suits your fancy.
77
+ Hit us up in the [Stevedore](https://github.com/newsdev/stevedore) issues.
@@ -17,7 +17,7 @@ module Stevedore
17
17
  end
18
18
 
19
19
  def self.new_from_tika(content, metadata, download_url, filename)
20
- self.new(metadata["title"] || File.basename(filename), content, download_url)
20
+ self.new( ((metadata["title"] && metadata["title"] != "Untitled") ? metadata["title"] : File.basename(filename)), content, download_url)
21
21
  end
22
22
 
23
23
  def analyze!
@@ -26,12 +26,16 @@ module Stevedore
26
26
  end
27
27
 
28
28
  def to_hash
29
+ sha = Digest::SHA1.hexdigest(download_url)
30
+ # TODO should merge in or something?
29
31
  {
30
- "sha1" => Digest::SHA1.hexdigest(download_url),
31
- "title" => title.to_s,
32
+ "sha1" => sha,
33
+ "id" => sha,
34
+ "_id" => sha,
35
+ "title" => title.to_s || "Untitled Document: #{HumanHash::HumanHasher.new.humanize(sha)}",
32
36
  "source_url" => download_url.to_s,
33
37
  "file" => {
34
- "title" => title.to_s,
38
+ "title" => title.to_s || "Untitled Document: #{HumanHash::HumanHasher.new.humanize(sha)}",
35
39
  "file" => clean_text.to_s
36
40
  },
37
41
  "analyzed" => {
@@ -40,7 +44,7 @@ module Stevedore
40
44
  "Content-Type" => extra["Content-Type"] || "text/plain"
41
45
  }
42
46
  },
43
- "_updatedAt" => Time.now
47
+ "_updatedAt" => Time.now,
44
48
  }
45
49
  end
46
50
 
@@ -1,7 +1,7 @@
1
1
  require 'digest/sha1'
2
2
 
3
3
  module Stevedore
4
- class StevedoreCsvRow
4
+ class StevedoreCsvRow < StevedoreBlob
5
5
  attr_accessor :title, :text, :download_url, :whole_row, :row_num
6
6
  def initialize(title, text, row_num, download_url, whole_row={})
7
7
  self.title = title || download_url
@@ -2,12 +2,15 @@ require_relative './stevedore_blob'
2
2
  require 'cgi'
3
3
  require 'digest/sha1'
4
4
  require 'manticore'
5
+ require 'dkimverify'
6
+
7
+
5
8
  module Stevedore
6
9
  class StevedoreEmail < StevedoreBlob
7
10
 
8
11
 
9
12
  # TODO write wrt other fields. where do those go???
10
- attr_accessor :creation_date, :message_to, :message_from, :message_cc, :subject, :attachments, :content_type
13
+ attr_accessor :creation_date, :message_to, :message_from, :message_cc, :subject, :attachments, :content_type, :dkim_verified
11
14
 
12
15
  def self.new_from_tika(content, metadata, download_url, filepath)
13
16
  t = super
@@ -16,6 +19,11 @@ module Stevedore
16
19
  t.message_from = metadata["Message-From"]
17
20
  t.message_cc = metadata["Message-Cc"]
18
21
  t.title = t.subject = metadata["subject"]
22
+ t.dkim_verified = begin
23
+ Dkim::Verifier.new(filepath).verify!
24
+ rescue Dkim::DkimError
25
+ false
26
+ end
19
27
  t.attachments = metadata["X-Attachments"].to_s.split("|").map do |raw_attachment_filename|
20
28
  attachment_filename = CGI::unescape(raw_attachment_filename)
21
29
  possible_filename = File.join(File.dirname(filepath), attachment_filename)
@@ -72,7 +80,8 @@ module Stevedore
72
80
  "Message-From" => message_to.is_a?(Enumerable) ? message_to : [ message_to ],
73
81
  "Message-Cc" => message_cc.is_a?(Enumerable) ? message_cc : [ message_cc ],
74
82
  "subject" => subject,
75
- "attachments" => attachments
83
+ "attachments" => attachments,
84
+ "dkim_verified" => dkim_verified
76
85
  }
77
86
  },
78
87
  "_updatedAt" => Time.now
@@ -138,7 +138,7 @@ module Stevedore
138
138
  # TODO: factor these out in favor of the yield/block situation down below.
139
139
  # this should (eventually) be totally generic, but perhaps handle common
140
140
  # document types on its own
141
- ret = case # .eml # .msg
141
+ doc = case # .eml # .msg
142
142
  when metadata["Content-Type"] == "message/rfc822" || metadata["Content-Type"] == "application/vnd.ms-outlook"
143
143
  ::Stevedore::StevedoreEmail.new_from_tika(content, metadata, download_url, filename).to_hash
144
144
  when metadata["Content-Type"] && ["application/html", "application/xhtml+xml"].include?(metadata["Content-Type"].split(";").first)
@@ -156,16 +156,19 @@ module Stevedore
156
156
  File.delete("#{png}.txt") rescue nil
157
157
  end.join("\n\n")
158
158
  # e.g. Analysis-Corporation-2.png.pdf or Torture.pdf
159
- files = Dir["#{pdf_basename}.png.pdf"] + (Dir["#{pdf_basename}-*.png.pdf"].sort_by{|pdf| Regexp.new("#{pdf_basename}-([0-9]+).png.pdf").match(pdf)[1].to_i })
160
- return nil if files.empty?
161
- system('pdftk', *files, "cat", "output", "#{pdf_basename}.ocr.pdf")
162
- content, _ = Rika.parse_content_and_metadata("#{pdf_basename}.ocr.pdf")
159
+ files = Dir["#{pdf_basename}.png.pdf"] + (Dir["#{pdf_basename}-*.png.pdf"].sort_by{|pdf| (m = Regexp.new("#{pdf_basename}-([0-9]+).png.pdf").match(pdf)) ? m[1].to_i : 69420 }) # 69420 is a random really big number, sorting those docs to the end.
160
+ if files.empty?
161
+ content = ''
162
+ else
163
+ system('pdftk', *files, "cat", "output", "#{pdf_basename}.ocr.pdf")
164
+ content, _ = Rika.parse_content_and_metadata("#{pdf_basename}.ocr.pdf")
165
+ end
163
166
  puts "OCRed content (#{File.basename(filename)}) length: #{content.length}"
164
167
  ::Stevedore::StevedoreBlob.new_from_tika(content, metadata, download_url, filename).to_hash
165
168
  else
166
169
  ::Stevedore::StevedoreBlob.new_from_tika(content, metadata, download_url, filename).to_hash
167
170
  end
168
- [ret, content, metadata]
171
+ [doc, content, metadata]
169
172
  rescue StandardError, java.lang.NoClassDefFoundError, org.apache.tika.exception.TikaException => e
170
173
  STDERR.puts e.inspect
171
174
  STDERR.puts "#{e} #{e.message}: #{filename}"
@@ -220,7 +223,6 @@ module Stevedore
220
223
  s3_path_without_bucket = target_path.gsub(/s3:\/\//i, '').split("/", 2).last
221
224
  bucket.objects(:prefix => s3_path_without_bucket).each_slice(@slice_size) do |slice_of_objs|
222
225
  docs_so_far += slice_of_objs.size
223
-
224
226
  output_stream.puts "starting a set of #{@slice_size} -- so far #{docs_so_far}"
225
227
  slice_of_objs.map! do |obj|
226
228
  next if obj.key[-1] == "/"
@@ -244,12 +246,19 @@ module Stevedore
244
246
  if ArchiveSplitter::HANDLED_FORMATS.include?(tmp_filename.split(".")[-1])
245
247
  ArchiveSplitter.split(tmp_filename).map do |constituent_file, constituent_basename, attachment_basenames, parent_basename|
246
248
  doc, content, metadata = process_document(constituent_file, download_filename)
249
+ next nil if doc.nil?
247
250
  doc["analyzed"] ||= {}
248
251
  doc["analyzed"]["metadata"] ||= {}
252
+
253
+ # this is a hack: but we're replicating how IDs are calculated (in parsers/stevedore_blob.rb) to make "attachments" the list of IDs of all documents in the archive
254
+ # we have to set separate sha1s for these, because they're by default based only on the download URL (which is the same for all of the constituent files)
249
255
  doc["analyzed"]["metadata"]["attachments"] = (parent_basename.nil? ? [] : [Digest::SHA1.hexdigest(download_filename + parent_basename)]) + attachment_basenames.map{|attachment| Digest::SHA1.hexdigest(download_filename + attachment) } # is a list of filenames
250
256
  doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive, we need to come up with a custom sha1)
257
+ doc["id"] = doc["sha1"]
258
+ doc["_id"] = doc["sha1"]
251
259
  yield doc, obj.key, content, metadata if block_given?
252
260
  FileUtils.rm(constituent_file) rescue Errno::ENOENT # try to delete, but no biggie if it doesn't work for some weird reason.
261
+ doc["file"]["title"] ||= "Untitled Document: #{HumanHash::HumanHasher.new.humanize(doc["_id"])}"
253
262
  doc
254
263
  end
255
264
  else
@@ -300,14 +309,20 @@ module Stevedore
300
309
  if ArchiveSplitter::HANDLED_FORMATS.include?(filename.split(".")[-1])
301
310
  ArchiveSplitter.split(filename).map do |constituent_file, constituent_basename, attachment_basenames, parent_basename|
302
311
  doc, content, metadata = process_document(constituent_file, download_filename)
303
- doc = {} if doc.nil?
312
+ next nil if doc.nil?
304
313
  doc["analyzed"] ||= {}
305
314
  doc["analyzed"]["metadata"] ||= {}
315
+
316
+ # this is a hack: but we're replicating how IDs are calculated (in parsers/stevedore_blob.rb) to make "attachments" the list of IDs of all documents in the archive
317
+ # we have to set separate sha1s for these, because they're by default based only on the download URL (which is the same for all of the constituent files)
306
318
  doc["analyzed"]["metadata"]["attachments"] = (parent_basename.nil? ? [] : [Digest::SHA1.hexdigest(download_filename + parent_basename)]) + attachment_basenames.map{|attachment| Digest::SHA1.hexdigest(download_filename + attachment) } # is a list of filenames
307
319
  doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive, we need to come up with a custom sha1)
308
320
  doc["id"] = doc["sha1"]
321
+ doc["_id"] = doc["sha1"]
322
+
309
323
  yield doc, filename, content, metadata if block_given?
310
324
  # FileUtils.rm(constituent_file) rescue Errno::ENOENT # try to delete, but no biggie if it doesn't work for some weird reason.
325
+ puts doc.inspect
311
326
  doc
312
327
  end
313
328
  else
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: stevedore-uploader
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.7
4
+ version: 1.0.9
5
5
  platform: java
6
6
  authors:
7
7
  - Jeremy B. Merrill
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-11-14 00:00:00.000000000 Z
11
+ date: 2017-02-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -71,7 +71,7 @@ dependencies:
71
71
  requirements:
72
72
  - - "~>"
73
73
  - !ruby/object:Gem::Version
74
- version: 1.7.1
74
+ version: '1.7'
75
75
  name: rika-stevedore
76
76
  prerelease: false
77
77
  type: :runtime
@@ -79,7 +79,7 @@ dependencies:
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: 1.7.1
82
+ version: '1.7'
83
83
  - !ruby/object:Gem::Dependency
84
84
  requirement: !ruby/object:Gem::Requirement
85
85
  requirements:
@@ -136,6 +136,34 @@ dependencies:
136
136
  - - "~>"
137
137
  - !ruby/object:Gem::Version
138
138
  version: '1.1'
139
+ - !ruby/object:Gem::Dependency
140
+ requirement: !ruby/object:Gem::Requirement
141
+ requirements:
142
+ - - "~>"
143
+ - !ruby/object:Gem::Version
144
+ version: 0.1.0
145
+ name: humanhash
146
+ prerelease: false
147
+ type: :runtime
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: 0.1.0
153
+ - !ruby/object:Gem::Dependency
154
+ requirement: !ruby/object:Gem::Requirement
155
+ requirements:
156
+ - - "~>"
157
+ - !ruby/object:Gem::Version
158
+ version: 0.0.4
159
+ name: dkimverify
160
+ prerelease: false
161
+ type: :runtime
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - "~>"
165
+ - !ruby/object:Gem::Version
166
+ version: 0.0.4
139
167
  description: TK
140
168
  email: jeremy.merrill@nytimes.com
141
169
  executables: []
@@ -170,7 +198,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
170
198
  version: '0'
171
199
  requirements: []
172
200
  rubyforge_project:
173
- rubygems_version: 2.4.8
201
+ rubygems_version: 2.6.6
174
202
  signing_key:
175
203
  specification_version: 4
176
204
  summary: Upload documents to a Stevedore search engine.