chupa-text-decomposer-pdf 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/chupa-text-decomposer-pdf.gemspec +1 -1
- data/doc/text/news.md +6 -0
- data/lib/chupa-text/decomposers/pdf.rb +43 -8
- data/test/fixture/encrypted.odt +0 -0
- data/test/fixture/encrypted.pdf +0 -0
- data/test/run-test.rb +2 -0
- data/test/test-pdf.rb +41 -11
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 71901aa054dc878f955620473635ee11df436a2b
|
4
|
+
data.tar.gz: 68bbc514ce94a38865e48a27e1849c1baec7df9e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 66d265273482895c235c053c5cd69ae92a9510288abcc2a72a8a4f686d9942c15fe6ae4d442d13cf95bb21f5696b81aac68e354c7a023b25da1b292aaece0ade
|
7
|
+
data.tar.gz: 649fa65ae2b74fc051896afc161eb91c8db9ad04626af94d287bf32bf8961a0ce7feb521f635ccf043297172d550ce2bb1c59ea826e211c43eb6a8fa84d1a943
|
@@ -22,7 +22,7 @@ end
|
|
22
22
|
|
23
23
|
Gem::Specification.new do |spec|
|
24
24
|
spec.name = "chupa-text-decomposer-pdf"
|
25
|
-
spec.version = "1.0.
|
25
|
+
spec.version = "1.0.2"
|
26
26
|
spec.homepage = "https://github.com/ranguba/chupa-text-decomposer-pdf"
|
27
27
|
spec.authors = ["Kouhei Sutou"]
|
28
28
|
spec.email = ["kou@clear-code.com"]
|
data/doc/text/news.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
1
|
+
# Copyright (C) 2013-2014 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -29,13 +29,13 @@ module ChupaText
|
|
29
29
|
end
|
30
30
|
|
31
31
|
def decompose(data)
|
32
|
-
document =
|
32
|
+
document = create_document(data)
|
33
33
|
text = ""
|
34
34
|
document.each do |page|
|
35
35
|
page_text = page.get_text
|
36
36
|
next if page_text.empty?
|
37
|
-
text << "\n" unless text.empty?
|
38
37
|
text << page_text
|
38
|
+
text << "\n" unless page_text.end_with?("\n")
|
39
39
|
end
|
40
40
|
text_data = TextData.new(text)
|
41
41
|
text_data.uri = data.uri
|
@@ -45,17 +45,52 @@ module ChupaText
|
|
45
45
|
add_attribute(text_data, document, :keywords)
|
46
46
|
add_attribute(text_data, document, :creator)
|
47
47
|
add_attribute(text_data, document, :producer)
|
48
|
-
add_attribute(text_data, document, :creation_date)
|
48
|
+
add_attribute(text_data, document, :creation_date, :created_time)
|
49
49
|
yield(text_data)
|
50
50
|
end
|
51
51
|
|
52
52
|
private
|
53
|
-
def
|
54
|
-
|
53
|
+
def create_document(data)
|
54
|
+
_password = password(data)
|
55
|
+
begin
|
56
|
+
wrap_stderr do
|
57
|
+
Poppler::Document.new(data.body, _password)
|
58
|
+
end
|
59
|
+
rescue GLib::Error => error
|
60
|
+
case error.code
|
61
|
+
when Poppler::Error::ENCRYPTED.to_i
|
62
|
+
raise ChupaText::EncryptedError.new(data)
|
63
|
+
else
|
64
|
+
raise ChupaText::InvalidDataError.new(data, error.message)
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
def password(data)
|
70
|
+
password = @options[:password]
|
71
|
+
if password.respond_to?(:call)
|
72
|
+
password = password.call(data)
|
73
|
+
end
|
74
|
+
password
|
75
|
+
end
|
76
|
+
|
77
|
+
def wrap_stderr
|
78
|
+
stderr = $stderr.dup
|
79
|
+
input, output = IO.pipe
|
80
|
+
_ = input # TODO: Report output
|
81
|
+
$stderr.reopen(output)
|
82
|
+
yield
|
83
|
+
ensure
|
84
|
+
$stderr.reopen(stderr)
|
85
|
+
end
|
86
|
+
|
87
|
+
def add_attribute(text_data, document,
|
88
|
+
pdf_attribute_name, data_attribute_name=nil)
|
89
|
+
value = document.send(pdf_attribute_name)
|
55
90
|
return if value.nil?
|
56
|
-
attribute_name = name.to_s.gsub(/_/, "-")
|
57
91
|
value = Time.at(value).utc.iso8601 if value.is_a?(Integer)
|
58
|
-
|
92
|
+
data_attribute_name ||= pdf_attribute_name.to_s.gsub(/_/, "-")
|
93
|
+
text_data[data_attribute_name] = value
|
59
94
|
end
|
60
95
|
end
|
61
96
|
end
|
Binary file
|
Binary file
|
data/test/run-test.rb
CHANGED
data/test/test-pdf.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
1
|
+
# Copyright (C) 2013-2014 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -18,10 +18,14 @@ require "pathname"
|
|
18
18
|
|
19
19
|
class TestPDF < Test::Unit::TestCase
|
20
20
|
def setup
|
21
|
-
@
|
21
|
+
@options = {}
|
22
22
|
end
|
23
23
|
|
24
24
|
private
|
25
|
+
def decomposer
|
26
|
+
ChupaText::Decomposers::PDF.new(@options)
|
27
|
+
end
|
28
|
+
|
25
29
|
def fixture_path(*components)
|
26
30
|
base_path = Pathname(__FILE__).dirname + "fixture"
|
27
31
|
base_path.join(*components)
|
@@ -37,11 +41,11 @@ class TestPDF < Test::Unit::TestCase
|
|
37
41
|
end
|
38
42
|
|
39
43
|
def test_pdf
|
40
|
-
assert_true(
|
44
|
+
assert_true(decomposer.target?(create_data("index.pdf")))
|
41
45
|
end
|
42
46
|
|
43
47
|
def test_html
|
44
|
-
assert_false(
|
48
|
+
assert_false(decomposer.target?(create_data("index.html")))
|
45
49
|
end
|
46
50
|
end
|
47
51
|
|
@@ -53,11 +57,11 @@ class TestPDF < Test::Unit::TestCase
|
|
53
57
|
end
|
54
58
|
|
55
59
|
def test_pdf
|
56
|
-
assert_true(
|
60
|
+
assert_true(decomposer.target?(create_data("application/pdf")))
|
57
61
|
end
|
58
62
|
|
59
63
|
def test_html
|
60
|
-
assert_false(
|
64
|
+
assert_false(decomposer.target?(create_data("text/html")))
|
61
65
|
end
|
62
66
|
end
|
63
67
|
end
|
@@ -69,7 +73,7 @@ class TestPDF < Test::Unit::TestCase
|
|
69
73
|
data.mime_type = "text/pdf"
|
70
74
|
|
71
75
|
decomposed = []
|
72
|
-
|
76
|
+
decomposer.decompose(data) do |decomposed_data|
|
73
77
|
decomposed << decomposed_data
|
74
78
|
end
|
75
79
|
decomposed
|
@@ -100,8 +104,9 @@ class TestPDF < Test::Unit::TestCase
|
|
100
104
|
assert_equal(["LibreOffice 4.1"], decompose("producer"))
|
101
105
|
end
|
102
106
|
|
103
|
-
def
|
104
|
-
assert_equal([
|
107
|
+
def test_created_time
|
108
|
+
assert_equal([Time.parse("2014-01-06T00:52:45+09:00")],
|
109
|
+
decompose("created_time"))
|
105
110
|
end
|
106
111
|
|
107
112
|
private
|
@@ -114,7 +119,7 @@ class TestPDF < Test::Unit::TestCase
|
|
114
119
|
|
115
120
|
sub_test_case("one page") do
|
116
121
|
def test_body
|
117
|
-
assert_equal(["Page1"], decompose.collect(&:body))
|
122
|
+
assert_equal(["Page1\n"], decompose.collect(&:body))
|
118
123
|
end
|
119
124
|
|
120
125
|
private
|
@@ -125,7 +130,7 @@ class TestPDF < Test::Unit::TestCase
|
|
125
130
|
|
126
131
|
sub_test_case("multi pages") do
|
127
132
|
def test_body
|
128
|
-
assert_equal(["Page1\nPage2"], decompose.collect(&:body))
|
133
|
+
assert_equal(["Page1\nPage2\n"], decompose.collect(&:body))
|
129
134
|
end
|
130
135
|
|
131
136
|
private
|
@@ -133,5 +138,30 @@ class TestPDF < Test::Unit::TestCase
|
|
133
138
|
super(fixture_path("multi-pages.pdf"))
|
134
139
|
end
|
135
140
|
end
|
141
|
+
|
142
|
+
sub_test_case("encrypted") do
|
143
|
+
def test_with_password
|
144
|
+
@options = {:password => "encrypted"}
|
145
|
+
assert_equal(["Password is 'encrypted'.\n"],
|
146
|
+
decompose.collect(&:body))
|
147
|
+
end
|
148
|
+
|
149
|
+
def test_with_password_block
|
150
|
+
@options = {:password => lambda {|data| "encrypted"}}
|
151
|
+
assert_equal(["Password is 'encrypted'.\n"],
|
152
|
+
decompose.collect(&:body))
|
153
|
+
end
|
154
|
+
|
155
|
+
def test_without_password
|
156
|
+
assert_raise(ChupaText::EncryptedError) do
|
157
|
+
decompose
|
158
|
+
end
|
159
|
+
end
|
160
|
+
|
161
|
+
private
|
162
|
+
def decompose
|
163
|
+
super(fixture_path("encrypted.pdf"))
|
164
|
+
end
|
165
|
+
end
|
136
166
|
end
|
137
167
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chupa-text-decomposer-pdf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kouhei Sutou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-02-
|
11
|
+
date: 2014-02-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: chupa-text
|
@@ -130,7 +130,9 @@ files:
|
|
130
130
|
- test/test-pdf.rb
|
131
131
|
- test/fixture/one-page.pdf
|
132
132
|
- test/fixture/multi-pages.pdf
|
133
|
+
- test/fixture/encrypted.pdf
|
133
134
|
- test/fixture/attributes.pdf
|
135
|
+
- test/fixture/encrypted.odt
|
134
136
|
- test/fixture/one-page.odt
|
135
137
|
- test/fixture/multi-pages.odt
|
136
138
|
- test/fixture/attributes.odt
|