chupa-text-decomposer-abiword 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +5 -0
- data/Gemfile +27 -0
- data/LICENSE.txt +502 -0
- data/README.md +38 -0
- data/Rakefile +48 -0
- data/chupa-text-decomposer-abiword.gemspec +50 -0
- data/doc/text/news.md +5 -0
- data/lib/chupa-text/decomposers/abiword.rb +132 -0
- data/test/fixture/abw/multi-pages.abw +43 -0
- data/test/fixture/abw/one-page.abw +41 -0
- data/test/fixture/doc/multi-pages.doc +0 -0
- data/test/fixture/doc/one-page.doc +0 -0
- data/test/fixture/docx/multi-pages.docx +0 -0
- data/test/fixture/docx/one-page.docx +0 -0
- data/test/fixture/odt/multi-pages.odt +0 -0
- data/test/fixture/odt/one-page.odt +0 -0
- data/test/fixture/rtf/multi-pages.rtf +19 -0
- data/test/fixture/rtf/one-page.rtf +17 -0
- data/test/fixture/zabw/multi-pages.zabw +0 -0
- data/test/fixture/zabw/one-page.zabw +0 -0
- data/test/helper.rb +57 -0
- data/test/run-test.rb +31 -0
- data/test/test-abw.rb +84 -0
- data/test/test-doc.rb +84 -0
- data/test/test-docx.rb +84 -0
- data/test/test-odt.rb +84 -0
- data/test/test-rtf.rb +84 -0
- data/test/test-zabw.rb +71 -0
- metadata +176 -0
data/README.md
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# README
|
2
|
+
|
3
|
+
## Name
|
4
|
+
|
5
|
+
chupa-text-decomposer-abiword
|
6
|
+
|
7
|
+
## Description
|
8
|
+
|
9
|
+
This is a ChupaText decomposer plugin to extract text and
|
10
|
+
meta-data from office documents such as Microsoft Word files and
|
11
|
+
LibreOffice Writer files.
|
12
|
+
|
13
|
+
You can use the `abiword` decomposer.
|
14
|
+
|
15
|
+
## Install
|
16
|
+
|
17
|
+
Install chupa-text-decomposer-abiword gem:
|
18
|
+
|
19
|
+
```
|
20
|
+
% gem install chupa-text-decomposer-abiword
|
21
|
+
```
|
22
|
+
|
23
|
+
Now, you can extract text and meta-data from office documents:
|
24
|
+
|
25
|
+
```
|
26
|
+
% chupa-text document.doc
|
27
|
+
```
|
28
|
+
|
29
|
+
## Author
|
30
|
+
|
31
|
+
* Sutou Kouhei `<kou@clear-code.com>`
|
32
|
+
|
33
|
+
## License
|
34
|
+
|
35
|
+
LGPL 2.1 or later.
|
36
|
+
|
37
|
+
(Sutou Kouhei has a right to change the license including contributed
|
38
|
+
patches.)
|
data/Rakefile
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
#
|
3
|
+
# Copyright (C) 2019 Sutou Kouhei <kou@clear-code.com>
|
4
|
+
#
|
5
|
+
# This library is free software; you can redistribute it and/or
|
6
|
+
# modify it under the terms of the GNU Lesser General Public
|
7
|
+
# License as published by the Free Software Foundation; either
|
8
|
+
# version 2.1 of the License, or (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This library is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
13
|
+
# Lesser General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU Lesser General Public
|
16
|
+
# License along with this library; if not, write to the Free Software
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
18
|
+
|
19
|
+
# Running rake without a task name runs the test task.
task default: :test

require "pathname"

require "rubygems"
require "bundler/gem_helper"
require "packnga"

# Bundler's gem helper is rooted at the directory that contains this Rakefile.
gem_helper = Bundler::GemHelper.new(File.dirname(__FILE__))

# Override version_tag so that it returns the value of `version` unchanged.
def gem_helper.version_tag
  version
end

gem_helper.install
gemspec = gem_helper.gemspec

# Documentation tasks: the original text is English, translated into Japanese.
Packnga::DocumentTask.new(gemspec) do |document_task|
  document_task.original_language = "en"
  document_task.translate_language = "ja"
end

# Release tasks with no extra configuration.
Packnga::ReleaseTask.new(gemspec) do
end

desc "Run tests"
task(:test) { ruby("test/run-test.rb") }
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
#
|
3
|
+
# Copyright (C) 2019 Sutou Kouhei <kou@clear-code.com>
|
4
|
+
#
|
5
|
+
# This library is free software; you can redistribute it and/or
|
6
|
+
# modify it under the terms of the GNU Lesser General Public
|
7
|
+
# License as published by the Free Software Foundation; either
|
8
|
+
# version 2.1 of the License, or (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This library is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
13
|
+
# Lesser General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU Lesser General Public
|
16
|
+
# License along with this library; if not, write to the Free Software
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
18
|
+
|
19
|
+
# Drops the newlines at the very beginning and the very end of a README
# section and terminates the result with a single newline.
strip_blank_edges = ->(text) { "#{text.gsub(/(\A\n+|\n+\z)/, '')}\n" }

Gem::Specification.new do |spec|
  spec.name = "chupa-text-decomposer-abiword"
  spec.version = "1.0.0"
  spec.homepage = "https://github.com/ranguba/chupa-text-decomposer-abiword"
  spec.authors = ["Sutou Kouhei"]
  spec.email = ["kou@clear-code.com"]

  # The gem description is the body of the "## Description" section of
  # README.md; the summary is its first paragraph.
  sections = File.read("README.md", encoding: "UTF-8").split(/^\#\#\s(.*)$/)
  description_body = sections[sections.index("Description") + 1]
  description = strip_blank_edges.call(description_body)
  spec.summary = description.split(/\n\n+/, 2).first
  spec.description = description
  spec.license = "LGPL-2.1+"

  spec.files = [
    "#{spec.name}.gemspec",
    "README.md",
    "LICENSE.txt",
    "Rakefile",
    "Gemfile",
    ".yardopts",
    *Dir.glob("lib/**/*.rb"),
    *Dir.glob("doc/text/*"),
    *Dir.glob("test/**/*"),
  ]

  ["chupa-text", "chupa-text-decomposer-pdf"].each do |gem_name|
    spec.add_runtime_dependency(gem_name)
  end

  ["bundler", "rake", "test-unit", "packnga", "kramdown"].each do |gem_name|
    spec.add_development_dependency(gem_name)
  end
end
|
data/doc/text/news.md
ADDED
@@ -0,0 +1,132 @@
|
|
1
|
+
# Copyright (C) 2019 Sutou Kouhei <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "tempfile"
|
18
|
+
|
19
|
+
module ChupaText
  module Decomposers
    # Decomposer that converts word processor documents to PDF by running an
    # external AbiWord command and yields the resulting PDF as virtual file
    # data.
    #
    # The command is searched for in this order: the +:abiword+ option, the
    # +ABIWORD+ environment variable, then the plain name "abiword". When none
    # of them exists, #target? returns false for every data.
    class AbiWord < Decomposer
      include Loggable

      registry.register("abiword", self)

      # File extensions (without the leading dot) handled by this decomposer.
      EXTENSIONS = [
        "abw",
        "doc",
        "docx",
        "odt",
        "rtf",
        "zabw",
      ].freeze

      # MIME types handled by this decomposer.
      MIME_TYPES = [
        "application/msword",
        "application/rtf",
        "application/vnd.oasis.opendocument.text",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/x-abiword",
      ].freeze

      # Resolves the AbiWord command once and logs whether it was found.
      #
      # @param options [Hash] decomposer options; the +:abiword+ key may name
      #   the command to use (read through @options, which is presumably set
      #   by the superclass initializer -- confirm against Decomposer).
      def initialize(options)
        super
        @command = find_command
        debug do
          if @command
            "#{log_tag}[command][found] #{@command.path}"
          else
            "#{log_tag}[command][not-found]"
          end
        end
      end

      # @param data the data to be checked; must respond to +extension+ and
      #   +mime_type+.
      # @return [Boolean] true when the command is available and either the
      #   extension or the MIME type of +data+ is supported.
      def target?(data)
        return false if @command.nil?

        EXTENSIONS.include?(data.extension) ||
          MIME_TYPES.include?(data.mime_type)
      end

      # Converts +data+ to PDF and yields the PDF data. Nothing is yielded
      # when the conversion fails.
      #
      # @param data the source document data.
      # @yieldparam pdf_data [VirtualFileData] the converted PDF.
      def decompose(data)
        pdf_data = convert_to_pdf(data)
        return if pdf_data.nil?

        yield(pdf_data)
      end

      private

      # Returns an ExternalCommand for the first candidate that exists, or
      # nil when no candidate exists.
      def find_command
        candidates = [
          @options[:abiword],
          ENV["ABIWORD"],
          "abiword",
        ]
        candidates.each do |candidate|
          next if candidate.nil?

          command = ExternalCommand.new(candidate)
          return command if command.exist?
        end
        nil
      end

      # Runs the command with "--to pdf --to-name <temporary PDF>" on the path
      # of +data+, redirecting its standard output and standard error to
      # temporary log files.
      #
      # @return [VirtualFileData, nil] PDF data whose URI is the URI of
      #   +data+ with its last extension replaced by ".pdf", or nil (after
      #   logging both log files) when the command did not succeed.
      def convert_to_pdf(data)
        create_tempfiles(data) do |pdf, stdout, stderr|
          succeeded = @command.run("--to", "pdf",
                                   "--to-name", pdf.path,
                                   data.path.to_s,
                                   {
                                     data: data,
                                     spawn_options: {
                                       out: stdout.path,
                                       err: stderr.path,
                                     },
                                   })
          unless succeeded
            error do
              tag = "#{log_tag}[convert][exited][abnormally]"
              [
                tag,
                "output: <#{stdout.read}>",
                "error: <#{stderr.read}>",
              ].join("\n")
            end
            return nil
          end
          # The pattern is anchored at the end of the string, so it can match
          # at most once; a URI without an extension is left unchanged.
          normalized_pdf_uri = data.uri.to_s.sub(/\.[^.]+\z/, ".pdf")
          File.open(pdf.path, "rb") do |pdf_input|
            VirtualFileData.new(normalized_pdf_uri,
                                pdf_input,
                                source_data: data)
          end
        end
      end

      # Creates temporary files for the PDF output, the standard output log
      # and the standard error log, yields them in that order and removes them
      # afterwards.
      #
      # The files are created inside the protected region so that files that
      # were already created are removed even when a later creation raises.
      #
      # @return the value of the block.
      def create_tempfiles(data)
        basename = File.basename(data.path)
        tempfiles = []
        begin
          [".pdf", ".stdout.log", ".stderr.log"].each do |suffix|
            tempfiles << Tempfile.new([basename, suffix])
          end
          yield(*tempfiles)
        ensure
          tempfiles.each(&:close!)
        end
      end

      # Prefix for all log messages written by this decomposer.
      def log_tag
        "[decomposer][abiword]"
      end
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<!DOCTYPE abiword PUBLIC "-//ABISOURCE//DTD AWML 1.0 Strict//EN" "http://www.abisource.com/awml.dtd">
|
3
|
+
<abiword template="false" xmlns:ct="http://www.abisource.com/changetracking.dtd" xmlns:fo="http://www.w3.org/1999/XSL/Format" xmlns:math="http://www.w3.org/1998/Math/MathML" xid-max="4" xmlns:dc="http://purl.org/dc/elements/1.1/" fileformat="1.1" xmlns:svg="http://www.w3.org/2000/svg" xmlns:awml="http://www.abisource.com/awml.dtd" xmlns="http://www.abisource.com/awml.dtd" xmlns:xlink="http://www.w3.org/1999/xlink" version="3.0.2" xml:space="preserve" props="dom-dir:ltr; document-footnote-restart-section:0; document-endnote-type:numeric; document-endnote-place-enddoc:1; document-endnote-initial:1; lang:ja-JP; document-endnote-restart-section:0; document-footnote-restart-page:0; document-footnote-type:numeric; document-footnote-initial:1; document-endnote-place-endsection:0">
|
4
|
+
<!-- ======================================================================== -->
|
5
|
+
<!-- This file is an AbiWord document. -->
|
6
|
+
<!-- AbiWord is a free, Open Source word processor. -->
|
7
|
+
<!-- More information about AbiWord is available at http://www.abisource.com/ -->
|
8
|
+
<!-- You should not edit this file by hand. -->
|
9
|
+
<!-- ======================================================================== -->
|
10
|
+
|
11
|
+
<metadata>
|
12
|
+
<m key="abiword.date_last_changed">Thu Jun 13 16:15:17 2019
|
13
|
+
</m>
|
14
|
+
<m key="abiword.generator">AbiWord</m>
|
15
|
+
<m key="dc.date">Thu Jun 13 16:15:17 2019
|
16
|
+
</m>
|
17
|
+
<m key="dc.format">application/x-abiword</m>
|
18
|
+
<m key="meta:editing-cycles">1</m>
|
19
|
+
<m key="meta:editing-duration">P0D</m>
|
20
|
+
</metadata>
|
21
|
+
<rdf>
|
22
|
+
<t s="styles.xml" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/odf#StylesFile</t>
|
23
|
+
<t s="content.xml" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/odf#ContentFile</t>
|
24
|
+
<t s="manifest.rdf" p="http://docs.oasis-open.org/ns/office/1.2/meta/pkg#hasPart" objecttype="1" xsdtype="" >styles.xml</t>
|
25
|
+
<t s="manifest.rdf" p="http://docs.oasis-open.org/ns/office/1.2/meta/pkg#hasPart" objecttype="1" xsdtype="" >content.xml</t>
|
26
|
+
<t s="manifest.rdf" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/pkg#Document</t>
|
27
|
+
</rdf>
|
28
|
+
<history version="1" edit-time="16" last-saved="1560410117" uid="f5d2a67c-8daa-11e9-84b7-a514b42a119c">
|
29
|
+
<version id="1" started="1560410117" uid="ff487a92-8daa-11e9-84b7-a514b42a119c" auto="0" top-xid="4"/>
|
30
|
+
</history>
|
31
|
+
<styles>
|
32
|
+
<s type="P" name="Normal" props="lang:en-US; default-tab-interval:1.251cm; font-size:12pt; font-family:Liberation Serif; dom-dir:ltr"/>
|
33
|
+
<s type="P" name="Caption" basedon="Normal" followedby="Caption" props="margin-top:0.212cm; font-size:12pt; margin-bottom:0.212cm; font-style:italic"/>
|
34
|
+
<s type="P" name="Heading" basedon="Normal" followedby="Text body" props="margin-top:0.423cm; keep-with-next:yes; margin-bottom:0.212cm; font-family:Liberation Sans; font-size:14pt"/>
|
35
|
+
<s type="P" name="Text body" basedon="Normal" followedby="Text body" props="margin-bottom:0.212cm; margin-top:0cm"/>
|
36
|
+
</styles>
|
37
|
+
<pagesize pagetype="A4" orientation="portrait" width="210.000000" height="297.000000" units="mm" page-scale="1.000000"/>
|
38
|
+
<section xid="1" props="page-margin-right:2cm; page-width:21.001cm; page-margin-left:2cm; page-orientation:portrait; page-margin-bottom:2cm; page-margin-top:2cm; page-height:29.7cm">
|
39
|
+
<p style="Normal" xid="2">Page1</p>
|
40
|
+
<p xid="3"><pbr/></p>
|
41
|
+
<p style="Normal" props="" xid="4">Page2</p>
|
42
|
+
</section>
|
43
|
+
</abiword>
|
@@ -0,0 +1,41 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<!DOCTYPE abiword PUBLIC "-//ABISOURCE//DTD AWML 1.0 Strict//EN" "http://www.abisource.com/awml.dtd">
|
3
|
+
<abiword template="false" xmlns:ct="http://www.abisource.com/changetracking.dtd" xmlns:fo="http://www.w3.org/1999/XSL/Format" xmlns:math="http://www.w3.org/1998/Math/MathML" xid-max="2" xmlns:dc="http://purl.org/dc/elements/1.1/" fileformat="1.1" xmlns:svg="http://www.w3.org/2000/svg" xmlns:awml="http://www.abisource.com/awml.dtd" xmlns="http://www.abisource.com/awml.dtd" xmlns:xlink="http://www.w3.org/1999/xlink" version="3.0.2" xml:space="preserve" props="dom-dir:ltr; document-footnote-restart-section:0; document-endnote-type:numeric; document-endnote-place-enddoc:1; document-endnote-initial:1; lang:ja-JP; document-endnote-restart-section:0; document-footnote-restart-page:0; document-footnote-type:numeric; document-footnote-initial:1; document-endnote-place-endsection:0">
|
4
|
+
<!-- ======================================================================== -->
|
5
|
+
<!-- This file is an AbiWord document. -->
|
6
|
+
<!-- AbiWord is a free, Open Source word processor. -->
|
7
|
+
<!-- More information about AbiWord is available at http://www.abisource.com/ -->
|
8
|
+
<!-- You should not edit this file by hand. -->
|
9
|
+
<!-- ======================================================================== -->
|
10
|
+
|
11
|
+
<metadata>
|
12
|
+
<m key="abiword.date_last_changed">Thu Jun 13 16:15:31 2019
|
13
|
+
</m>
|
14
|
+
<m key="abiword.generator">AbiWord</m>
|
15
|
+
<m key="dc.date">Thu Jun 13 16:15:31 2019
|
16
|
+
</m>
|
17
|
+
<m key="dc.format">application/x-abiword</m>
|
18
|
+
<m key="meta:editing-cycles">1</m>
|
19
|
+
<m key="meta:editing-duration">P0D</m>
|
20
|
+
</metadata>
|
21
|
+
<rdf>
|
22
|
+
<t s="styles.xml" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/odf#StylesFile</t>
|
23
|
+
<t s="content.xml" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/odf#ContentFile</t>
|
24
|
+
<t s="manifest.rdf" p="http://docs.oasis-open.org/ns/office/1.2/meta/pkg#hasPart" objecttype="1" xsdtype="" >styles.xml</t>
|
25
|
+
<t s="manifest.rdf" p="http://docs.oasis-open.org/ns/office/1.2/meta/pkg#hasPart" objecttype="1" xsdtype="" >content.xml</t>
|
26
|
+
<t s="manifest.rdf" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/pkg#Document</t>
|
27
|
+
</rdf>
|
28
|
+
<history version="1" edit-time="7" last-saved="1560410131" uid="03403cde-8dab-11e9-9847-bcd35e03657c">
|
29
|
+
<version id="1" started="1560410131" uid="07b66e0a-8dab-11e9-9847-bcd35e03657c" auto="0" top-xid="2"/>
|
30
|
+
</history>
|
31
|
+
<styles>
|
32
|
+
<s type="P" name="Normal" props="lang:en-US; default-tab-interval:1.251cm; font-size:12pt; font-family:Liberation Serif; dom-dir:ltr"/>
|
33
|
+
<s type="P" name="Caption" basedon="Normal" followedby="Caption" props="margin-top:0.212cm; font-size:12pt; margin-bottom:0.212cm; font-style:italic"/>
|
34
|
+
<s type="P" name="Heading" basedon="Normal" followedby="Text body" props="margin-top:0.423cm; keep-with-next:yes; margin-bottom:0.212cm; font-family:Liberation Sans; font-size:14pt"/>
|
35
|
+
<s type="P" name="Text body" basedon="Normal" followedby="Text body" props="margin-bottom:0.212cm; margin-top:0cm"/>
|
36
|
+
</styles>
|
37
|
+
<pagesize pagetype="A4" orientation="portrait" width="210.000000" height="297.000000" units="mm" page-scale="1.000000"/>
|
38
|
+
<section xid="1" props="page-margin-right:2cm; page-width:21.001cm; page-margin-left:2cm; page-orientation:portrait; page-margin-bottom:2cm; page-margin-top:2cm; page-height:29.7cm">
|
39
|
+
<p style="Normal" xid="2">Page1</p>
|
40
|
+
</section>
|
41
|
+
</abiword>
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,19 @@
|
|
1
|
+
{\rtf1\ansi\deff3\adeflang1025
|
2
|
+
{\fonttbl{\f0\froman\fprq2\fcharset0 Times New Roman;}{\f1\froman\fprq2\fcharset2 Symbol;}{\f2\fswiss\fprq2\fcharset0 Arial;}{\f3\froman\fprq2\fcharset0 Liberation Serif{\*\falt Times New Roman};}{\f4\fswiss\fprq2\fcharset0 Liberation Sans{\*\falt Arial};}{\f5\fnil\fprq2\fcharset128 \'83\'82\'83\'67\'83\'84L\'83\'7d\'83\'8b\'83\'78\'83\'8a3\'93\'99\'95\'9d;}{\f6\fnil\fprq2\fcharset0 Lohit Devanagari;}{\f7\fnil\fprq0\fcharset0 Lohit Devanagari;}}
|
3
|
+
{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}
|
4
|
+
{\stylesheet{\s0\snext0\nowidctlpar\hyphpar0\aspalpha\ltrpar\cf0\kerning1\dbch\af5\langfe1041\dbch\af6\afs24\alang1081\loch\f3\hich\af3\fs24\lang1033 Normal;}
|
5
|
+
{\s15\sbasedon0\snext16\sb240\sa120\keepn\dbch\af5\dbch\af6\afs28\loch\f4\fs28 \u35211\'3f\u20986\'3f\u12375\'3f;}
|
6
|
+
{\s16\sbasedon0\snext16\sb0\sa120 \u26412\'3f\u25991\'3f;}
|
7
|
+
{\s17\sbasedon16\snext17\sb0\sa120\dbch\af7 \u12522\'3f\u12473\'3f\u12488\'3f;}
|
8
|
+
{\s18\sbasedon0\snext18\sb120\sa120\noline\i\dbch\af7\afs24\ai\fs24 \u12461\'3f\u12515\'3f\u12503\'3f\u12471\'3f\u12519\'3f\u12531\'3f;}
|
9
|
+
{\s19\sbasedon0\snext19\noline\dbch\af7 \u32034\'3f\u24341\'3f;}
|
10
|
+
}{\*\generator LibreOffice/6.1.5.2$Linux_X86_64 LibreOffice_project/10$Build-2}{\info{\creatim\yr2014\mo1\dy5\hr15\min35}{\revtim\yr2014\mo1\dy5\hr15\min36}{\printim\yr0\mo0\dy0\hr0\min0}}{\*\userprops}\deftab709
|
11
|
+
\viewscale110
|
12
|
+
{\*\pgdsctbl
|
13
|
+
{\pgdsc0\pgdscuse451\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0 \u27161\'3f\u28310\'3f\u12473\'3f\u12479\'3f\u12452\'3f\u12523\'3f;}}
|
14
|
+
\formshade\paperh16838\paperw11906\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\sectunlocked1\pgndec\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
|
15
|
+
{\*\ftnsep\chftnsep}\pgndec\pard\plain \s0\nowidctlpar\hyphpar0\aspalpha\ltrpar\cf0\kerning1\dbch\af5\langfe1041\dbch\af6\afs24\alang1081\loch\f3\hich\af3\fs24\lang1033{\rtlch \ltrch\loch
|
16
|
+
Page1}
|
17
|
+
\par \pard\plain \s0\nowidctlpar\hyphpar0\aspalpha\ltrpar\cf0\kerning1\dbch\af5\langfe1041\dbch\af6\afs24\alang1081\loch\f3\hich\af3\fs24\lang1033\pagebb{\rtlch \ltrch\loch
|
18
|
+
Page2}
|
19
|
+
\par }
|
@@ -0,0 +1,17 @@
|
|
1
|
+
{\rtf1\ansi\deff3\adeflang1025
|
2
|
+
{\fonttbl{\f0\froman\fprq2\fcharset0 Times New Roman;}{\f1\froman\fprq2\fcharset2 Symbol;}{\f2\fswiss\fprq2\fcharset0 Arial;}{\f3\froman\fprq2\fcharset0 Liberation Serif{\*\falt Times New Roman};}{\f4\fswiss\fprq2\fcharset0 Liberation Sans{\*\falt Arial};}{\f5\fnil\fprq2\fcharset128 \'83\'82\'83\'67\'83\'84L\'83\'7d\'83\'8b\'83\'78\'83\'8a3\'93\'99\'95\'9d;}{\f6\fnil\fprq2\fcharset0 Lohit Devanagari;}{\f7\fnil\fprq0\fcharset0 Lohit Devanagari;}}
|
3
|
+
{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}
|
4
|
+
{\stylesheet{\s0\snext0\nowidctlpar\hyphpar0\aspalpha\ltrpar\cf0\kerning1\dbch\af5\langfe1041\dbch\af6\afs24\alang1081\loch\f3\hich\af3\fs24\lang1033 Normal;}
|
5
|
+
{\s15\sbasedon0\snext16\sb240\sa120\keepn\dbch\af5\dbch\af6\afs28\loch\f4\fs28 \u35211\'3f\u20986\'3f\u12375\'3f;}
|
6
|
+
{\s16\sbasedon0\snext16\sb0\sa120 \u26412\'3f\u25991\'3f;}
|
7
|
+
{\s17\sbasedon16\snext17\sb0\sa120\dbch\af7 \u12522\'3f\u12473\'3f\u12488\'3f;}
|
8
|
+
{\s18\sbasedon0\snext18\sb120\sa120\noline\i\dbch\af7\afs24\ai\fs24 \u12461\'3f\u12515\'3f\u12503\'3f\u12471\'3f\u12519\'3f\u12531\'3f;}
|
9
|
+
{\s19\sbasedon0\snext19\noline\dbch\af7 \u32034\'3f\u24341\'3f;}
|
10
|
+
}{\*\generator LibreOffice/6.1.5.2$Linux_X86_64 LibreOffice_project/10$Build-2}{\info{\creatim\yr2014\mo1\dy5\hr15\min34}{\revtim\yr2014\mo1\dy5\hr15\min35}{\printim\yr0\mo0\dy0\hr0\min0}}{\*\userprops}\deftab709
|
11
|
+
\viewscale110
|
12
|
+
{\*\pgdsctbl
|
13
|
+
{\pgdsc0\pgdscuse451\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0 \u27161\'3f\u28310\'3f\u12473\'3f\u12479\'3f\u12452\'3f\u12523\'3f;}}
|
14
|
+
\formshade\paperh16838\paperw11906\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\sectunlocked1\pgndec\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
|
15
|
+
{\*\ftnsep\chftnsep}\pgndec\pard\plain \s0\nowidctlpar\hyphpar0\aspalpha\ltrpar\cf0\kerning1\dbch\af5\langfe1041\dbch\af6\afs24\alang1081\loch\f3\hich\af3\fs24\lang1033{\rtlch \ltrch\loch
|
16
|
+
Page1}
|
17
|
+
\par }
|
Binary file
|
Binary file
|
data/test/helper.rb
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
# Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "pathname"
|
18
|
+
|
19
|
+
# Helper to locate test fixture files.
module FixtureHelper
  # Builds a path below the "fixture" directory that sits next to this file.
  #
  # @param components [Array<String>] path components below "fixture".
  # @return [Pathname] the joined path.
  def fixture_path(*components)
    fixture_root = Pathname(__dir__).join("fixture")
    fixture_root.join(*components)
  end
end
|
25
|
+
|
26
|
+
# Test helper that runs the decomposer under test and normalizes PDF
# producer strings.
module DecomposeHelper
  # Decomposes the file at +path+ with @decomposer. Each result that the PDF
  # decomposer accepts as a target is decomposed once more with the PDF
  # decomposer; other results are collected as they are.
  #
  # @decomposer must be set by the including test case.
  #
  # @param path passed to ChupaText::InputData.new.
  # @return [Array] the collected decomposed data.
  def decompose(path)
    data = ChupaText::InputData.new(path)

    pdf_decomposer = ChupaText::Decomposers::PDF.new({})
    decomposed = []
    @decomposer.decompose(data) do |decomposed_data|
      if pdf_decomposer.target?(decomposed_data)
        pdf_decomposer.decompose(decomposed_data) do |pdf_decomposed_data|
          decomposed << pdf_decomposed_data
        end
      else
        decomposed << decomposed_data
      end
    end
    decomposed
  end

  # Applies #normalize_producer to every element.
  #
  # @param producers [Array<String, nil>] producer strings.
  # @return [Array<String, nil>] normalized producer strings.
  def normalize_producers(producers)
    producers.map { |producer| normalize_producer(producer) }
  end

  # Replaces a versioned cairo producer string such as
  # "cairo 1.16.0 (https://cairographics.org)" with "cairo" so that
  # expectations do not depend on the installed cairo version. Both the
  # "http" and the "https" form of the URL are accepted.
  #
  # @param producer [String, nil] a producer string.
  # @return [String, nil] "cairo" for a cairo producer, otherwise +producer+
  #   unchanged.
  def normalize_producer(producer)
    if %r{\Acairo \d+\.\d+\.\d+ \(https?://cairographics\.org\)\z}.match?(producer)
      "cairo"
    else
      producer
    end
  end
end
|