chupa-text-decomposer-pdf 1.0.8 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/chupa-text-decomposer-pdf.gemspec +2 -2
- data/doc/text/news.md +8 -0
- data/lib/chupa-text/decomposers/pdf.rb +19 -3
- data/test/fixture/empty.pdf +1 -0
- data/test/test-pdf.rb +25 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3d1885ca60dea6d4d7ffb8b4c3aa1f35dda7a1619577bad2b2526bd7d7db6180
|
4
|
+
data.tar.gz: 7d2ccc5f1dd135a5ddf281cf9d2c31215dbcad4943c12f0f03cea32e277a7a2f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6ab4e283c338c8f6a9157912781efa345c35c2eb39b5ee08cbdd293e9570deddb4654c138dc704af1093e1714657b7392bf63f0685eadd467ca86d480988af31
|
7
|
+
data.tar.gz: 2745efd44d1e686b53f49f6569a4d9ebcccd4937fb01ea544d1bbc580d8cefa4f958f9cde3e6bd21453a5e29889340f2662655bd09e9fdb36cc1cb6d55651703
|
@@ -22,7 +22,7 @@ end
|
|
22
22
|
|
23
23
|
Gem::Specification.new do |spec|
|
24
24
|
spec.name = "chupa-text-decomposer-pdf"
|
25
|
-
spec.version = "1.0.8"
|
25
|
+
spec.version = "1.1.0"
|
26
26
|
spec.homepage = "https://github.com/ranguba/chupa-text-decomposer-pdf"
|
27
27
|
spec.authors = ["Kouhei Sutou"]
|
28
28
|
spec.email = ["kou@clear-code.com"]
|
@@ -39,7 +39,7 @@ Gem::Specification.new do |spec|
|
|
39
39
|
spec.files += Dir.glob("doc/text/*")
|
40
40
|
spec.files += Dir.glob("test/**/*")
|
41
41
|
|
42
|
-
spec.add_runtime_dependency("chupa-text", ">= 1.1.
|
42
|
+
spec.add_runtime_dependency("chupa-text", ">= 1.1.9")
|
43
43
|
spec.add_runtime_dependency("poppler")
|
44
44
|
|
45
45
|
spec.add_development_dependency("bundler")
|
data/doc/text/news.md
CHANGED
(hunks for this file are not shown)
data/lib/chupa-text/decomposers/pdf.rb
CHANGED
@@ -22,6 +22,8 @@ require "poppler"
|
|
22
22
|
module ChupaText
|
23
23
|
module Decomposers
|
24
24
|
class PDF < Decomposer
|
25
|
+
include Loggable
|
26
|
+
|
25
27
|
registry.register("pdf", self)
|
26
28
|
|
27
29
|
def target?(data)
|
@@ -37,6 +39,8 @@ module ChupaText
|
|
37
39
|
|
38
40
|
def decompose(data)
|
39
41
|
document = create_document(data)
|
42
|
+
return if document.nil?
|
43
|
+
|
40
44
|
text = ""
|
41
45
|
document.each do |page|
|
42
46
|
page_text = page.get_text
|
@@ -75,7 +79,9 @@ module ChupaText
|
|
75
79
|
if path.nil?
|
76
80
|
file = Tempfile.new(["chupa-text-decomposer-pdf", ".pdf"])
|
77
81
|
file.binmode
|
78
|
-
|
82
|
+
data.open do |input|
|
83
|
+
IO.copy_stream(input, file)
|
84
|
+
end
|
79
85
|
file.close
|
80
86
|
path = file.path
|
81
87
|
end
|
@@ -85,8 +91,14 @@ module ChupaText
|
|
85
91
|
end
|
86
92
|
rescue Poppler::Error::Encrypted
|
87
93
|
raise ChupaText::EncryptedError.new(data)
|
88
|
-
rescue
|
89
|
-
|
94
|
+
rescue Poppler::Error => poppler_error
|
95
|
+
error do
|
96
|
+
message = "#{log_tag} Failed to process PDF: "
|
97
|
+
message << "#{poppler_error.class}: #{poppler_error.message}\n"
|
98
|
+
message << poppler_error.backtrace.join("\n")
|
99
|
+
message
|
100
|
+
end
|
101
|
+
nil
|
90
102
|
end
|
91
103
|
end
|
92
104
|
|
@@ -146,6 +158,10 @@ module ChupaText
|
|
146
158
|
|
147
159
|
Screenshot.new("image/png", [png.string].pack("m*"), "base64")
|
148
160
|
end
|
161
|
+
|
162
|
+
def log_tag
|
163
|
+
"[decomposer][pdf]"
|
164
|
+
end
|
149
165
|
end
|
150
166
|
end
|
151
167
|
end
|
@@ -0,0 +1 @@
|
|
1
|
+
%PDF-1
|
data/test/test-pdf.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013-
|
1
|
+
# Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -32,6 +32,13 @@ class TestPDF < Test::Unit::TestCase
|
|
32
32
|
base_path.join(*components)
|
33
33
|
end
|
34
34
|
|
35
|
+
def capture_log(&block)
|
36
|
+
ChupaText::CaptureLogger.capture(&block).collect do |level, message|
|
37
|
+
message = message.split("\n", 2)[0]
|
38
|
+
[level, message]
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
35
42
|
sub_test_case("target?") do
|
36
43
|
sub_test_case("extension") do
|
37
44
|
def create_data(uri)
|
@@ -212,5 +219,22 @@ class TestPDF < Test::Unit::TestCase
|
|
212
219
|
end
|
213
220
|
end
|
214
221
|
end
|
222
|
+
|
223
|
+
sub_test_case("invalid") do
|
224
|
+
def test_empty
|
225
|
+
messages = capture_log do
|
226
|
+
assert_equal([],
|
227
|
+
decompose(fixture_path("empty.pdf")).collect(&:body))
|
228
|
+
end
|
229
|
+
assert_equal([
|
230
|
+
[
|
231
|
+
:error,
|
232
|
+
"[decomposer][pdf] Failed to process PDF: " +
|
233
|
+
"Poppler::Error::Damaged: PDF document is damaged",
|
234
|
+
],
|
235
|
+
],
|
236
|
+
messages)
|
237
|
+
end
|
238
|
+
end
|
215
239
|
end
|
216
240
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chupa-text-decomposer-pdf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.8
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kouhei Sutou
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 1.1.
|
19
|
+
version: 1.1.9
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 1.1.
|
26
|
+
version: 1.1.9
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: poppler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -143,6 +143,7 @@ files:
|
|
143
143
|
- lib/chupa-text/decomposers/pdf.rb
|
144
144
|
- test/fixture/attributes.odt
|
145
145
|
- test/fixture/attributes.pdf
|
146
|
+
- test/fixture/empty.pdf
|
146
147
|
- test/fixture/encrypted.odt
|
147
148
|
- test/fixture/encrypted.pdf
|
148
149
|
- test/fixture/multi-pages.odt
|