stevedore-uploader 1.0.7-java → 1.0.9-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/parsers/stevedore_blob.rb +9 -5
- data/lib/parsers/stevedore_csv_row.rb +1 -1
- data/lib/parsers/stevedore_email.rb +11 -2
- data/lib/stevedore-uploader.rb +23 -8
- metadata +33 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d46b0968e5b625a0364cd9e165930b9f0ea0849b
|
4
|
+
data.tar.gz: 1f42e1aac685f31206f33fc6c7a68f06a94e8ab3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: aa15c8bf07c5f1410396892e54b369dff1b5df14cf3521e02c7da449db84373f24c508bff3a9a28944741f8b4fb760505802b4975f62fb5aadffa516a29b6a5d
|
7
|
+
data.tar.gz: dd4c87fc77d0156dfca7324182d1f741e1babdf8320d499209d017b8b9061349f19220d5804b725225d9c794cbd2cde22bfb617a0d054fc025691368a69816b6
|
data/README.md
CHANGED
@@ -17,7 +17,7 @@ module Stevedore
|
|
17
17
|
end
|
18
18
|
|
19
19
|
def self.new_from_tika(content, metadata, download_url, filename)
|
20
|
-
self.new(metadata["title"]
|
20
|
+
self.new( ((metadata["title"] && metadata["title"] != "Untitled") ? metadata["title"] : File.basename(filename)), content, download_url)
|
21
21
|
end
|
22
22
|
|
23
23
|
def analyze!
|
@@ -26,12 +26,16 @@ module Stevedore
|
|
26
26
|
end
|
27
27
|
|
28
28
|
def to_hash
|
29
|
+
sha = Digest::SHA1.hexdigest(download_url)
|
30
|
+
# TODO should merge in or something?
|
29
31
|
{
|
30
|
-
"sha1" =>
|
31
|
-
"
|
32
|
+
"sha1" => sha,
|
33
|
+
"id" => sha,
|
34
|
+
"_id" => sha,
|
35
|
+
"title" => title.to_s || "Untitled Document: #{HumanHash::HumanHasher.new.humanize(sha)}",
|
32
36
|
"source_url" => download_url.to_s,
|
33
37
|
"file" => {
|
34
|
-
"title" => title.to_s,
|
38
|
+
"title" => title.to_s || "Untitled Document: #{HumanHash::HumanHasher.new.humanize(sha)}",
|
35
39
|
"file" => clean_text.to_s
|
36
40
|
},
|
37
41
|
"analyzed" => {
|
@@ -40,7 +44,7 @@ module Stevedore
|
|
40
44
|
"Content-Type" => extra["Content-Type"] || "text/plain"
|
41
45
|
}
|
42
46
|
},
|
43
|
-
"_updatedAt" => Time.now
|
47
|
+
"_updatedAt" => Time.now,
|
44
48
|
}
|
45
49
|
end
|
46
50
|
|
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'digest/sha1'
|
2
2
|
|
3
3
|
module Stevedore
|
4
|
-
class StevedoreCsvRow
|
4
|
+
class StevedoreCsvRow < StevedoreBlob
|
5
5
|
attr_accessor :title, :text, :download_url, :whole_row, :row_num
|
6
6
|
def initialize(title, text, row_num, download_url, whole_row={})
|
7
7
|
self.title = title || download_url
|
@@ -2,12 +2,15 @@ require_relative './stevedore_blob'
|
|
2
2
|
require 'cgi'
|
3
3
|
require 'digest/sha1'
|
4
4
|
require 'manticore'
|
5
|
+
require 'dkimverify'
|
6
|
+
|
7
|
+
|
5
8
|
module Stevedore
|
6
9
|
class StevedoreEmail < StevedoreBlob
|
7
10
|
|
8
11
|
|
9
12
|
# TODO write wrt other fields. where do those go???
|
10
|
-
attr_accessor :creation_date, :message_to, :message_from, :message_cc, :subject, :attachments, :content_type
|
13
|
+
attr_accessor :creation_date, :message_to, :message_from, :message_cc, :subject, :attachments, :content_type, :dkim_verified
|
11
14
|
|
12
15
|
def self.new_from_tika(content, metadata, download_url, filepath)
|
13
16
|
t = super
|
@@ -16,6 +19,11 @@ module Stevedore
|
|
16
19
|
t.message_from = metadata["Message-From"]
|
17
20
|
t.message_cc = metadata["Message-Cc"]
|
18
21
|
t.title = t.subject = metadata["subject"]
|
22
|
+
t.dkim_verified = begin
|
23
|
+
Dkim::Verifier.new(filepath).verify!
|
24
|
+
rescue Dkim::DkimError
|
25
|
+
false
|
26
|
+
end
|
19
27
|
t.attachments = metadata["X-Attachments"].to_s.split("|").map do |raw_attachment_filename|
|
20
28
|
attachment_filename = CGI::unescape(raw_attachment_filename)
|
21
29
|
possible_filename = File.join(File.dirname(filepath), attachment_filename)
|
@@ -72,7 +80,8 @@ module Stevedore
|
|
72
80
|
"Message-From" => message_to.is_a?(Enumerable) ? message_to : [ message_to ],
|
73
81
|
"Message-Cc" => message_cc.is_a?(Enumerable) ? message_cc : [ message_cc ],
|
74
82
|
"subject" => subject,
|
75
|
-
"attachments" => attachments
|
83
|
+
"attachments" => attachments,
|
84
|
+
"dkim_verified" => dkim_verified
|
76
85
|
}
|
77
86
|
},
|
78
87
|
"_updatedAt" => Time.now
|
data/lib/stevedore-uploader.rb
CHANGED
@@ -138,7 +138,7 @@ module Stevedore
|
|
138
138
|
# TODO: factor these out in favor of the yield/block situation down below.
|
139
139
|
# this should (eventually) be totally generic, but perhaps handle common
|
140
140
|
# document types on its own
|
141
|
-
|
141
|
+
doc = case # .eml # .msg
|
142
142
|
when metadata["Content-Type"] == "message/rfc822" || metadata["Content-Type"] == "application/vnd.ms-outlook"
|
143
143
|
::Stevedore::StevedoreEmail.new_from_tika(content, metadata, download_url, filename).to_hash
|
144
144
|
when metadata["Content-Type"] && ["application/html", "application/xhtml+xml"].include?(metadata["Content-Type"].split(";").first)
|
@@ -156,16 +156,19 @@ module Stevedore
|
|
156
156
|
File.delete("#{png}.txt") rescue nil
|
157
157
|
end.join("\n\n")
|
158
158
|
# e.g. Analysis-Corporation-2.png.pdf or Torture.pdf
|
159
|
-
files = Dir["#{pdf_basename}.png.pdf"] + (Dir["#{pdf_basename}-*.png.pdf"].sort_by{|pdf| Regexp.new("#{pdf_basename}-([0-9]+).png.pdf").match(pdf)[1].to_i })
|
160
|
-
|
161
|
-
|
162
|
-
|
159
|
+
files = Dir["#{pdf_basename}.png.pdf"] + (Dir["#{pdf_basename}-*.png.pdf"].sort_by{|pdf| (m = Regexp.new("#{pdf_basename}-([0-9]+).png.pdf").match(pdf)) ? m[1].to_i : 69420 }) # 69420 is a random really big number, sorting those docs to the end.
|
160
|
+
if files.empty?
|
161
|
+
content = ''
|
162
|
+
else
|
163
|
+
system('pdftk', *files, "cat", "output", "#{pdf_basename}.ocr.pdf")
|
164
|
+
content, _ = Rika.parse_content_and_metadata("#{pdf_basename}.ocr.pdf")
|
165
|
+
end
|
163
166
|
puts "OCRed content (#{File.basename(filename)}) length: #{content.length}"
|
164
167
|
::Stevedore::StevedoreBlob.new_from_tika(content, metadata, download_url, filename).to_hash
|
165
168
|
else
|
166
169
|
::Stevedore::StevedoreBlob.new_from_tika(content, metadata, download_url, filename).to_hash
|
167
170
|
end
|
168
|
-
[
|
171
|
+
[doc, content, metadata]
|
169
172
|
rescue StandardError, java.lang.NoClassDefFoundError, org.apache.tika.exception.TikaException => e
|
170
173
|
STDERR.puts e.inspect
|
171
174
|
STDERR.puts "#{e} #{e.message}: #{filename}"
|
@@ -220,7 +223,6 @@ module Stevedore
|
|
220
223
|
s3_path_without_bucket = target_path.gsub(/s3:\/\//i, '').split("/", 2).last
|
221
224
|
bucket.objects(:prefix => s3_path_without_bucket).each_slice(@slice_size) do |slice_of_objs|
|
222
225
|
docs_so_far += slice_of_objs.size
|
223
|
-
|
224
226
|
output_stream.puts "starting a set of #{@slice_size} -- so far #{docs_so_far}"
|
225
227
|
slice_of_objs.map! do |obj|
|
226
228
|
next if obj.key[-1] == "/"
|
@@ -244,12 +246,19 @@ module Stevedore
|
|
244
246
|
if ArchiveSplitter::HANDLED_FORMATS.include?(tmp_filename.split(".")[-1])
|
245
247
|
ArchiveSplitter.split(tmp_filename).map do |constituent_file, constituent_basename, attachment_basenames, parent_basename|
|
246
248
|
doc, content, metadata = process_document(constituent_file, download_filename)
|
249
|
+
next nil if doc.nil?
|
247
250
|
doc["analyzed"] ||= {}
|
248
251
|
doc["analyzed"]["metadata"] ||= {}
|
252
|
+
|
253
|
+
# this is a hack: but we're replicating how IDs are calculated (in parsers/stevedore_blob.rb) to make "attachments" the list of IDs of all documents in the archive
|
254
|
+
# we have to set separate sha1s for these, because they're by default based only on the download URL (which is the same for all of the constituent files)
|
249
255
|
doc["analyzed"]["metadata"]["attachments"] = (parent_basename.nil? ? [] : [Digest::SHA1.hexdigest(download_filename + parent_basename)]) + attachment_basenames.map{|attachment| Digest::SHA1.hexdigest(download_filename + attachment) } # is a list of filenames
|
250
256
|
doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive, we need to come up with a custom sha1)
|
257
|
+
doc["id"] = doc["sha1"]
|
258
|
+
doc["_id"] = doc["sha1"]
|
251
259
|
yield doc, obj.key, content, metadata if block_given?
|
252
260
|
FileUtils.rm(constituent_file) rescue Errno::ENOENT # try to delete, but no biggie if it doesn't work for some weird reason.
|
261
|
+
doc["file"]["title"] ||= "Untitled Document: #{HumanHash::HumanHasher.new.humanize(doc["_id"])}"
|
253
262
|
doc
|
254
263
|
end
|
255
264
|
else
|
@@ -300,14 +309,20 @@ module Stevedore
|
|
300
309
|
if ArchiveSplitter::HANDLED_FORMATS.include?(filename.split(".")[-1])
|
301
310
|
ArchiveSplitter.split(filename).map do |constituent_file, constituent_basename, attachment_basenames, parent_basename|
|
302
311
|
doc, content, metadata = process_document(constituent_file, download_filename)
|
303
|
-
|
312
|
+
next nil if doc.nil?
|
304
313
|
doc["analyzed"] ||= {}
|
305
314
|
doc["analyzed"]["metadata"] ||= {}
|
315
|
+
|
316
|
+
# this is a hack: but we're replicating how IDs are calculated (in parsers/stevedore_blob.rb) to make "attachments" the list of IDs of all documents in the archive
|
317
|
+
# we have to set separate sha1s for these, because they're by default based only on the download URL (which is the same for all of the constituent files)
|
306
318
|
doc["analyzed"]["metadata"]["attachments"] = (parent_basename.nil? ? [] : [Digest::SHA1.hexdigest(download_filename + parent_basename)]) + attachment_basenames.map{|attachment| Digest::SHA1.hexdigest(download_filename + attachment) } # is a list of filenames
|
307
319
|
doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive, we need to come up with a custom sha1)
|
308
320
|
doc["id"] = doc["sha1"]
|
321
|
+
doc["_id"] = doc["sha1"]
|
322
|
+
|
309
323
|
yield doc, filename, content, metadata if block_given?
|
310
324
|
# FileUtils.rm(constituent_file) rescue Errno::ENOENT # try to delete, but no biggie if it doesn't work for some weird reason.
|
325
|
+
puts doc.inspect
|
311
326
|
doc
|
312
327
|
end
|
313
328
|
else
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: stevedore-uploader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.9
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Jeremy B. Merrill
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-02-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -71,7 +71,7 @@ dependencies:
|
|
71
71
|
requirements:
|
72
72
|
- - "~>"
|
73
73
|
- !ruby/object:Gem::Version
|
74
|
-
version: 1.7
|
74
|
+
version: '1.7'
|
75
75
|
name: rika-stevedore
|
76
76
|
prerelease: false
|
77
77
|
type: :runtime
|
@@ -79,7 +79,7 @@ dependencies:
|
|
79
79
|
requirements:
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: 1.7
|
82
|
+
version: '1.7'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
requirement: !ruby/object:Gem::Requirement
|
85
85
|
requirements:
|
@@ -136,6 +136,34 @@ dependencies:
|
|
136
136
|
- - "~>"
|
137
137
|
- !ruby/object:Gem::Version
|
138
138
|
version: '1.1'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
requirement: !ruby/object:Gem::Requirement
|
141
|
+
requirements:
|
142
|
+
- - "~>"
|
143
|
+
- !ruby/object:Gem::Version
|
144
|
+
version: 0.1.0
|
145
|
+
name: humanhash
|
146
|
+
prerelease: false
|
147
|
+
type: :runtime
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - "~>"
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: 0.1.0
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
requirement: !ruby/object:Gem::Requirement
|
155
|
+
requirements:
|
156
|
+
- - "~>"
|
157
|
+
- !ruby/object:Gem::Version
|
158
|
+
version: 0.0.4
|
159
|
+
name: dkimverify
|
160
|
+
prerelease: false
|
161
|
+
type: :runtime
|
162
|
+
version_requirements: !ruby/object:Gem::Requirement
|
163
|
+
requirements:
|
164
|
+
- - "~>"
|
165
|
+
- !ruby/object:Gem::Version
|
166
|
+
version: 0.0.4
|
139
167
|
description: TK
|
140
168
|
email: jeremy.merrill@nytimes.com
|
141
169
|
executables: []
|
@@ -170,7 +198,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
170
198
|
version: '0'
|
171
199
|
requirements: []
|
172
200
|
rubyforge_project:
|
173
|
-
rubygems_version: 2.
|
201
|
+
rubygems_version: 2.6.6
|
174
202
|
signing_key:
|
175
203
|
specification_version: 4
|
176
204
|
summary: Upload documents to a Stevedore search engine.
|