chupa-text-decomposer-abiword 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +5 -0
- data/Gemfile +27 -0
- data/LICENSE.txt +502 -0
- data/README.md +38 -0
- data/Rakefile +48 -0
- data/chupa-text-decomposer-abiword.gemspec +50 -0
- data/doc/text/news.md +5 -0
- data/lib/chupa-text/decomposers/abiword.rb +132 -0
- data/test/fixture/abw/multi-pages.abw +43 -0
- data/test/fixture/abw/one-page.abw +41 -0
- data/test/fixture/doc/multi-pages.doc +0 -0
- data/test/fixture/doc/one-page.doc +0 -0
- data/test/fixture/docx/multi-pages.docx +0 -0
- data/test/fixture/docx/one-page.docx +0 -0
- data/test/fixture/odt/multi-pages.odt +0 -0
- data/test/fixture/odt/one-page.odt +0 -0
- data/test/fixture/rtf/multi-pages.rtf +19 -0
- data/test/fixture/rtf/one-page.rtf +17 -0
- data/test/fixture/zabw/multi-pages.zabw +0 -0
- data/test/fixture/zabw/one-page.zabw +0 -0
- data/test/helper.rb +57 -0
- data/test/run-test.rb +31 -0
- data/test/test-abw.rb +84 -0
- data/test/test-doc.rb +84 -0
- data/test/test-docx.rb +84 -0
- data/test/test-odt.rb +84 -0
- data/test/test-rtf.rb +84 -0
- data/test/test-zabw.rb +71 -0
- metadata +176 -0
data/README.md
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# README
|
2
|
+
|
3
|
+
## Name
|
4
|
+
|
5
|
+
chupa-text-decomposer-abiword
|
6
|
+
|
7
|
+
## Description
|
8
|
+
|
9
|
+
This is a ChupaText decomposer plugin to extract text and
|
10
|
+
meta-data from office documents such as Microsoft Word files and
|
11
|
+
LibreOffice Writer files.
|
12
|
+
|
13
|
+
You can use the `abiword` decomposer.
|
14
|
+
|
15
|
+
## Install
|
16
|
+
|
17
|
+
Install chupa-text-decomposer-abiword gem:
|
18
|
+
|
19
|
+
```
|
20
|
+
% gem install chupa-text-decomposer-abiword
|
21
|
+
```
|
22
|
+
|
23
|
+
Now, you can extract text and meta-data from office documents:
|
24
|
+
|
25
|
+
```
|
26
|
+
% chupa-text document.doc
|
27
|
+
```
|
28
|
+
|
29
|
+
## Author
|
30
|
+
|
31
|
+
* Sutou Kouhei `<kou@clear-code.com>`
|
32
|
+
|
33
|
+
## License
|
34
|
+
|
35
|
+
LGPL 2.1 or later.
|
36
|
+
|
37
|
+
(Sutou Kouhei has a right to change the license including contributed
|
38
|
+
patches.)
|
data/Rakefile
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
#
|
3
|
+
# Copyright (C) 2019 Sutou Kouhei <kou@clear-code.com>
|
4
|
+
#
|
5
|
+
# This library is free software; you can redistribute it and/or
|
6
|
+
# modify it under the terms of the GNU Lesser General Public
|
7
|
+
# License as published by the Free Software Foundation; either
|
8
|
+
# version 2.1 of the License, or (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This library is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
13
|
+
# Lesser General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU Lesser General Public
|
16
|
+
# License along with this library; if not, write to the Free Software
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
18
|
+
|
19
|
+
# Running `rake` with no arguments runs the test task defined at the bottom.
task :default => :test

require "pathname"

require "rubygems"
require "bundler/gem_helper"
require "packnga"

# Directory that contains this Rakefile.
base_dir = Pathname(__FILE__).dirname

helper = Bundler::GemHelper.new(base_dir.to_s)
# Override the helper's tag name so that it is the bare version string.
def helper.version_tag
  version
end

# Define the gem tasks provided by Bundler::GemHelper and reuse the
# gemspec it loaded for the Packnga tasks below.
helper.install
spec = helper.gemspec

# Documentation tasks: English is the original language and Japanese is
# the translation target.
Packnga::DocumentTask.new(spec) do |task|
  task.original_language = "en"
  task.translate_language = "ja"
end

# Release tasks with the default configuration.
Packnga::ReleaseTask.new(spec) do
end

desc "Run tests"
task :test do
  ruby("test/run-test.rb")
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
#
|
3
|
+
# Copyright (C) 2019 Sutou Kouhei <kou@clear-code.com>
|
4
|
+
#
|
5
|
+
# This library is free software; you can redistribute it and/or
|
6
|
+
# modify it under the terms of the GNU Lesser General Public
|
7
|
+
# License as published by the Free Software Foundation; either
|
8
|
+
# version 2.1 of the License, or (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This library is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
13
|
+
# Lesser General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU Lesser General Public
|
16
|
+
# License along with this library; if not, write to the Free Software
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
18
|
+
|
19
|
+
# Strips every leading and trailing newline from +entry+ and appends
# exactly one trailing newline.
clean_white_space = lambda do |entry|
  entry.gsub(/(\A\n+|\n+\z)/, '') + "\n"
end

Gem::Specification.new do |spec|
  spec.name = "chupa-text-decomposer-abiword"
  spec.version = "1.0.0"
  spec.homepage = "https://github.com/ranguba/chupa-text-decomposer-abiword"
  spec.authors = ["Sutou Kouhei"]
  spec.email = ["kou@clear-code.com"]

  # The summary and the description are taken from the "## Description"
  # section of README.md. Because the split pattern has a capture group,
  # the section titles are kept in the result, so the element that
  # follows "Description" is the body of that section.
  # README.md is resolved against the current working directory.
  readme = File.read("README.md", encoding: "UTF-8")
  entries = readme.split(/^\#\#\s(.*)$/)
  description = clean_white_space.call(entries[entries.index("Description") + 1])
  # The summary is the first paragraph of the description.
  spec.summary = description.split(/\n\n+/, 2).first
  spec.description = description
  spec.license = "LGPL-2.1+"

  # Packaged files. The globs are resolved against the current working
  # directory, like README.md above.
  spec.files = ["#{spec.name}.gemspec"]
  spec.files += ["README.md", "LICENSE.txt", "Rakefile", "Gemfile"]
  spec.files += [".yardopts"]
  spec.files += Dir.glob("lib/**/*.rb")
  spec.files += Dir.glob("doc/text/*")
  spec.files += Dir.glob("test/**/*")

  spec.add_runtime_dependency("chupa-text")
  spec.add_runtime_dependency("chupa-text-decomposer-pdf")

  spec.add_development_dependency("bundler")
  spec.add_development_dependency("rake")
  spec.add_development_dependency("test-unit")
  spec.add_development_dependency("packnga")
  spec.add_development_dependency("kramdown")
end
|
data/doc/text/news.md
ADDED
@@ -0,0 +1,132 @@
|
|
1
|
+
# Copyright (C) 2019 Sutou Kouhei <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "tempfile"
|
18
|
+
|
19
|
+
module ChupaText
  module Decomposers
    # Decomposer registered as "abiword". It runs an external AbiWord
    # command to convert a word processor document to PDF and yields the
    # resulting PDF as new data, so that text extraction itself is left
    # to whichever decomposer handles PDF.
    class AbiWord < Decomposer
      include Loggable

      registry.register("abiword", self)

      # File extensions (without the leading dot) accepted by #target?.
      EXTENSIONS = [
        "abw",
        "doc",
        "docx",
        "odt",
        "rtf",
        "zabw",
      ].freeze

      # MIME types accepted by #target?.
      MIME_TYPES = [
        "application/msword",
        "application/rtf",
        "application/vnd.oasis.opendocument.text",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/x-abiword",
      ].freeze

      # Looks up the AbiWord command once at construction time and logs,
      # at debug level, whether it was found.
      #
      # @param options [Hash] decomposer options. The +:abiword+ key may
      #   name the command to use (see #find_command).
      def initialize(options)
        super
        @command = find_command
        debug do
          if @command
            "#{log_tag}[command][found] #{@command.path}"
          else
            "#{log_tag}[command][not-found]"
          end
        end
      end

      # @param data the data to be checked.
      # @return [Boolean] +false+ when no AbiWord command was found;
      #   otherwise whether the extension or the MIME type of +data+ is
      #   one of the supported ones.
      def target?(data)
        return false if @command.nil?
        EXTENSIONS.include?(data.extension) ||
          MIME_TYPES.include?(data.mime_type)
      end

      # Converts +data+ to PDF and yields the PDF data. Nothing is
      # yielded when the conversion fails.
      #
      # @param data the document to be converted.
      # @yieldparam pdf_data [VirtualFileData] the converted PDF.
      def decompose(data)
        pdf_data = convert_to_pdf(data)
        return if pdf_data.nil?
        yield(pdf_data)
      end

      private
      # Returns the first existing command among, in this order, the
      # +:abiword+ option, the +ABIWORD+ environment variable and plain
      # "abiword".
      #
      # @return [ExternalCommand, nil] +nil+ when none of them exists.
      def find_command
        candidates = [
          @options[:abiword],
          ENV["ABIWORD"],
          "abiword",
        ]
        candidates.each do |candidate|
          next if candidate.nil?
          command = ExternalCommand.new(candidate)
          return command if command.exist?
        end
        nil
      end

      # Runs the AbiWord command with "--to pdf --to-name <temporary file>"
      # on the path of +data+, with its standard output and standard
      # error redirected to temporary files.
      #
      # @return [VirtualFileData, nil] the PDF whose URI is the URI of
      #   +data+ with the last extension replaced by ".pdf", or +nil+
      #   when the command result is falsy. In the latter case the
      #   captured output is logged as an error.
      def convert_to_pdf(data)
        create_tempfiles(data) do |pdf, stdout, stderr|
          succeeded = @command.run("--to", "pdf",
                                   "--to-name", pdf.path,
                                   data.path.to_s,
                                   {
                                     data: data,
                                     spawn_options: {
                                       out: stdout.path,
                                       err: stderr.path,
                                     },
                                   })
          unless succeeded
            error do
              tag = "#{log_tag}[convert][exited][abnormally]"
              [
                tag,
                "output: <#{stdout.read}>",
                "error: <#{stderr.read}>",
              ].join("\n")
            end
            return nil
          end
          # The pattern is anchored at the end of the string, so at most
          # one replacement can happen. A URI without an extension is
          # used as is.
          normalized_pdf_uri = data.uri.to_s.sub(/\.[^.]+\z/, ".pdf")
          # NOTE(review): the temporary PDF is removed as soon as this
          # block returns, so VirtualFileData presumably consumes
          # pdf_input in its constructor -- confirm against ChupaText.
          File.open(pdf.path, "rb") do |pdf_input|
            VirtualFileData.new(normalized_pdf_uri,
                                pdf_input,
                                source_data: data)
          end
        end
      end

      # Creates the temporary files for the PDF, the standard output and
      # the standard error of the command, yields them in that order and
      # always removes them afterwards.
      #
      # The files are created inside the begin/ensure so that the ones
      # already created are removed even when creating a later one
      # raises.
      #
      # @return the value of the given block.
      def create_tempfiles(data)
        basename = File.basename(data.path)
        tempfiles = []
        begin
          [".pdf", ".stdout.log", ".stderr.log"].each do |suffix|
            tempfiles << Tempfile.new([basename, suffix])
          end
          yield(*tempfiles)
        ensure
          tempfiles.each do |tempfile|
            tempfile.close!
          end
        end
      end

      # Prefix of every log message written by this decomposer.
      def log_tag
        "[decomposer][abiword]"
      end
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<!DOCTYPE abiword PUBLIC "-//ABISOURCE//DTD AWML 1.0 Strict//EN" "http://www.abisource.com/awml.dtd">
|
3
|
+
<abiword template="false" xmlns:ct="http://www.abisource.com/changetracking.dtd" xmlns:fo="http://www.w3.org/1999/XSL/Format" xmlns:math="http://www.w3.org/1998/Math/MathML" xid-max="4" xmlns:dc="http://purl.org/dc/elements/1.1/" fileformat="1.1" xmlns:svg="http://www.w3.org/2000/svg" xmlns:awml="http://www.abisource.com/awml.dtd" xmlns="http://www.abisource.com/awml.dtd" xmlns:xlink="http://www.w3.org/1999/xlink" version="3.0.2" xml:space="preserve" props="dom-dir:ltr; document-footnote-restart-section:0; document-endnote-type:numeric; document-endnote-place-enddoc:1; document-endnote-initial:1; lang:ja-JP; document-endnote-restart-section:0; document-footnote-restart-page:0; document-footnote-type:numeric; document-footnote-initial:1; document-endnote-place-endsection:0">
|
4
|
+
<!-- ======================================================================== -->
|
5
|
+
<!-- This file is an AbiWord document. -->
|
6
|
+
<!-- AbiWord is a free, Open Source word processor. -->
|
7
|
+
<!-- More information about AbiWord is available at http://www.abisource.com/ -->
|
8
|
+
<!-- You should not edit this file by hand. -->
|
9
|
+
<!-- ======================================================================== -->
|
10
|
+
|
11
|
+
<metadata>
|
12
|
+
<m key="abiword.date_last_changed">Thu Jun 13 16:15:17 2019
|
13
|
+
</m>
|
14
|
+
<m key="abiword.generator">AbiWord</m>
|
15
|
+
<m key="dc.date">Thu Jun 13 16:15:17 2019
|
16
|
+
</m>
|
17
|
+
<m key="dc.format">application/x-abiword</m>
|
18
|
+
<m key="meta:editing-cycles">1</m>
|
19
|
+
<m key="meta:editing-duration">P0D</m>
|
20
|
+
</metadata>
|
21
|
+
<rdf>
|
22
|
+
<t s="styles.xml" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/odf#StylesFile</t>
|
23
|
+
<t s="content.xml" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/odf#ContentFile</t>
|
24
|
+
<t s="manifest.rdf" p="http://docs.oasis-open.org/ns/office/1.2/meta/pkg#hasPart" objecttype="1" xsdtype="" >styles.xml</t>
|
25
|
+
<t s="manifest.rdf" p="http://docs.oasis-open.org/ns/office/1.2/meta/pkg#hasPart" objecttype="1" xsdtype="" >content.xml</t>
|
26
|
+
<t s="manifest.rdf" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/pkg#Document</t>
|
27
|
+
</rdf>
|
28
|
+
<history version="1" edit-time="16" last-saved="1560410117" uid="f5d2a67c-8daa-11e9-84b7-a514b42a119c">
|
29
|
+
<version id="1" started="1560410117" uid="ff487a92-8daa-11e9-84b7-a514b42a119c" auto="0" top-xid="4"/>
|
30
|
+
</history>
|
31
|
+
<styles>
|
32
|
+
<s type="P" name="Normal" props="lang:en-US; default-tab-interval:1.251cm; font-size:12pt; font-family:Liberation Serif; dom-dir:ltr"/>
|
33
|
+
<s type="P" name="Caption" basedon="Normal" followedby="Caption" props="margin-top:0.212cm; font-size:12pt; margin-bottom:0.212cm; font-style:italic"/>
|
34
|
+
<s type="P" name="Heading" basedon="Normal" followedby="Text body" props="margin-top:0.423cm; keep-with-next:yes; margin-bottom:0.212cm; font-family:Liberation Sans; font-size:14pt"/>
|
35
|
+
<s type="P" name="Text body" basedon="Normal" followedby="Text body" props="margin-bottom:0.212cm; margin-top:0cm"/>
|
36
|
+
</styles>
|
37
|
+
<pagesize pagetype="A4" orientation="portrait" width="210.000000" height="297.000000" units="mm" page-scale="1.000000"/>
|
38
|
+
<section xid="1" props="page-margin-right:2cm; page-width:21.001cm; page-margin-left:2cm; page-orientation:portrait; page-margin-bottom:2cm; page-margin-top:2cm; page-height:29.7cm">
|
39
|
+
<p style="Normal" xid="2">Page1</p>
|
40
|
+
<p xid="3"><pbr/></p>
|
41
|
+
<p style="Normal" props="" xid="4">Page2</p>
|
42
|
+
</section>
|
43
|
+
</abiword>
|
@@ -0,0 +1,41 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<!DOCTYPE abiword PUBLIC "-//ABISOURCE//DTD AWML 1.0 Strict//EN" "http://www.abisource.com/awml.dtd">
|
3
|
+
<abiword template="false" xmlns:ct="http://www.abisource.com/changetracking.dtd" xmlns:fo="http://www.w3.org/1999/XSL/Format" xmlns:math="http://www.w3.org/1998/Math/MathML" xid-max="2" xmlns:dc="http://purl.org/dc/elements/1.1/" fileformat="1.1" xmlns:svg="http://www.w3.org/2000/svg" xmlns:awml="http://www.abisource.com/awml.dtd" xmlns="http://www.abisource.com/awml.dtd" xmlns:xlink="http://www.w3.org/1999/xlink" version="3.0.2" xml:space="preserve" props="dom-dir:ltr; document-footnote-restart-section:0; document-endnote-type:numeric; document-endnote-place-enddoc:1; document-endnote-initial:1; lang:ja-JP; document-endnote-restart-section:0; document-footnote-restart-page:0; document-footnote-type:numeric; document-footnote-initial:1; document-endnote-place-endsection:0">
|
4
|
+
<!-- ======================================================================== -->
|
5
|
+
<!-- This file is an AbiWord document. -->
|
6
|
+
<!-- AbiWord is a free, Open Source word processor. -->
|
7
|
+
<!-- More information about AbiWord is available at http://www.abisource.com/ -->
|
8
|
+
<!-- You should not edit this file by hand. -->
|
9
|
+
<!-- ======================================================================== -->
|
10
|
+
|
11
|
+
<metadata>
|
12
|
+
<m key="abiword.date_last_changed">Thu Jun 13 16:15:31 2019
|
13
|
+
</m>
|
14
|
+
<m key="abiword.generator">AbiWord</m>
|
15
|
+
<m key="dc.date">Thu Jun 13 16:15:31 2019
|
16
|
+
</m>
|
17
|
+
<m key="dc.format">application/x-abiword</m>
|
18
|
+
<m key="meta:editing-cycles">1</m>
|
19
|
+
<m key="meta:editing-duration">P0D</m>
|
20
|
+
</metadata>
|
21
|
+
<rdf>
|
22
|
+
<t s="styles.xml" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/odf#StylesFile</t>
|
23
|
+
<t s="content.xml" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/odf#ContentFile</t>
|
24
|
+
<t s="manifest.rdf" p="http://docs.oasis-open.org/ns/office/1.2/meta/pkg#hasPart" objecttype="1" xsdtype="" >styles.xml</t>
|
25
|
+
<t s="manifest.rdf" p="http://docs.oasis-open.org/ns/office/1.2/meta/pkg#hasPart" objecttype="1" xsdtype="" >content.xml</t>
|
26
|
+
<t s="manifest.rdf" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/pkg#Document</t>
|
27
|
+
</rdf>
|
28
|
+
<history version="1" edit-time="7" last-saved="1560410131" uid="03403cde-8dab-11e9-9847-bcd35e03657c">
|
29
|
+
<version id="1" started="1560410131" uid="07b66e0a-8dab-11e9-9847-bcd35e03657c" auto="0" top-xid="2"/>
|
30
|
+
</history>
|
31
|
+
<styles>
|
32
|
+
<s type="P" name="Normal" props="lang:en-US; default-tab-interval:1.251cm; font-size:12pt; font-family:Liberation Serif; dom-dir:ltr"/>
|
33
|
+
<s type="P" name="Caption" basedon="Normal" followedby="Caption" props="margin-top:0.212cm; font-size:12pt; margin-bottom:0.212cm; font-style:italic"/>
|
34
|
+
<s type="P" name="Heading" basedon="Normal" followedby="Text body" props="margin-top:0.423cm; keep-with-next:yes; margin-bottom:0.212cm; font-family:Liberation Sans; font-size:14pt"/>
|
35
|
+
<s type="P" name="Text body" basedon="Normal" followedby="Text body" props="margin-bottom:0.212cm; margin-top:0cm"/>
|
36
|
+
</styles>
|
37
|
+
<pagesize pagetype="A4" orientation="portrait" width="210.000000" height="297.000000" units="mm" page-scale="1.000000"/>
|
38
|
+
<section xid="1" props="page-margin-right:2cm; page-width:21.001cm; page-margin-left:2cm; page-orientation:portrait; page-margin-bottom:2cm; page-margin-top:2cm; page-height:29.7cm">
|
39
|
+
<p style="Normal" xid="2">Page1</p>
|
40
|
+
</section>
|
41
|
+
</abiword>
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,19 @@
|
|
1
|
+
{\rtf1\ansi\deff3\adeflang1025
|
2
|
+
{\fonttbl{\f0\froman\fprq2\fcharset0 Times New Roman;}{\f1\froman\fprq2\fcharset2 Symbol;}{\f2\fswiss\fprq2\fcharset0 Arial;}{\f3\froman\fprq2\fcharset0 Liberation Serif{\*\falt Times New Roman};}{\f4\fswiss\fprq2\fcharset0 Liberation Sans{\*\falt Arial};}{\f5\fnil\fprq2\fcharset128 \'83\'82\'83\'67\'83\'84L\'83\'7d\'83\'8b\'83\'78\'83\'8a3\'93\'99\'95\'9d;}{\f6\fnil\fprq2\fcharset0 Lohit Devanagari;}{\f7\fnil\fprq0\fcharset0 Lohit Devanagari;}}
|
3
|
+
{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}
|
4
|
+
{\stylesheet{\s0\snext0\nowidctlpar\hyphpar0\aspalpha\ltrpar\cf0\kerning1\dbch\af5\langfe1041\dbch\af6\afs24\alang1081\loch\f3\hich\af3\fs24\lang1033 Normal;}
|
5
|
+
{\s15\sbasedon0\snext16\sb240\sa120\keepn\dbch\af5\dbch\af6\afs28\loch\f4\fs28 \u35211\'3f\u20986\'3f\u12375\'3f;}
|
6
|
+
{\s16\sbasedon0\snext16\sb0\sa120 \u26412\'3f\u25991\'3f;}
|
7
|
+
{\s17\sbasedon16\snext17\sb0\sa120\dbch\af7 \u12522\'3f\u12473\'3f\u12488\'3f;}
|
8
|
+
{\s18\sbasedon0\snext18\sb120\sa120\noline\i\dbch\af7\afs24\ai\fs24 \u12461\'3f\u12515\'3f\u12503\'3f\u12471\'3f\u12519\'3f\u12531\'3f;}
|
9
|
+
{\s19\sbasedon0\snext19\noline\dbch\af7 \u32034\'3f\u24341\'3f;}
|
10
|
+
}{\*\generator LibreOffice/6.1.5.2$Linux_X86_64 LibreOffice_project/10$Build-2}{\info{\creatim\yr2014\mo1\dy5\hr15\min35}{\revtim\yr2014\mo1\dy5\hr15\min36}{\printim\yr0\mo0\dy0\hr0\min0}}{\*\userprops}\deftab709
|
11
|
+
\viewscale110
|
12
|
+
{\*\pgdsctbl
|
13
|
+
{\pgdsc0\pgdscuse451\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0 \u27161\'3f\u28310\'3f\u12473\'3f\u12479\'3f\u12452\'3f\u12523\'3f;}}
|
14
|
+
\formshade\paperh16838\paperw11906\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\sectunlocked1\pgndec\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
|
15
|
+
{\*\ftnsep\chftnsep}\pgndec\pard\plain \s0\nowidctlpar\hyphpar0\aspalpha\ltrpar\cf0\kerning1\dbch\af5\langfe1041\dbch\af6\afs24\alang1081\loch\f3\hich\af3\fs24\lang1033{\rtlch \ltrch\loch
|
16
|
+
Page1}
|
17
|
+
\par \pard\plain \s0\nowidctlpar\hyphpar0\aspalpha\ltrpar\cf0\kerning1\dbch\af5\langfe1041\dbch\af6\afs24\alang1081\loch\f3\hich\af3\fs24\lang1033\pagebb{\rtlch \ltrch\loch
|
18
|
+
Page2}
|
19
|
+
\par }
|
@@ -0,0 +1,17 @@
|
|
1
|
+
{\rtf1\ansi\deff3\adeflang1025
|
2
|
+
{\fonttbl{\f0\froman\fprq2\fcharset0 Times New Roman;}{\f1\froman\fprq2\fcharset2 Symbol;}{\f2\fswiss\fprq2\fcharset0 Arial;}{\f3\froman\fprq2\fcharset0 Liberation Serif{\*\falt Times New Roman};}{\f4\fswiss\fprq2\fcharset0 Liberation Sans{\*\falt Arial};}{\f5\fnil\fprq2\fcharset128 \'83\'82\'83\'67\'83\'84L\'83\'7d\'83\'8b\'83\'78\'83\'8a3\'93\'99\'95\'9d;}{\f6\fnil\fprq2\fcharset0 Lohit Devanagari;}{\f7\fnil\fprq0\fcharset0 Lohit Devanagari;}}
|
3
|
+
{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}
|
4
|
+
{\stylesheet{\s0\snext0\nowidctlpar\hyphpar0\aspalpha\ltrpar\cf0\kerning1\dbch\af5\langfe1041\dbch\af6\afs24\alang1081\loch\f3\hich\af3\fs24\lang1033 Normal;}
|
5
|
+
{\s15\sbasedon0\snext16\sb240\sa120\keepn\dbch\af5\dbch\af6\afs28\loch\f4\fs28 \u35211\'3f\u20986\'3f\u12375\'3f;}
|
6
|
+
{\s16\sbasedon0\snext16\sb0\sa120 \u26412\'3f\u25991\'3f;}
|
7
|
+
{\s17\sbasedon16\snext17\sb0\sa120\dbch\af7 \u12522\'3f\u12473\'3f\u12488\'3f;}
|
8
|
+
{\s18\sbasedon0\snext18\sb120\sa120\noline\i\dbch\af7\afs24\ai\fs24 \u12461\'3f\u12515\'3f\u12503\'3f\u12471\'3f\u12519\'3f\u12531\'3f;}
|
9
|
+
{\s19\sbasedon0\snext19\noline\dbch\af7 \u32034\'3f\u24341\'3f;}
|
10
|
+
}{\*\generator LibreOffice/6.1.5.2$Linux_X86_64 LibreOffice_project/10$Build-2}{\info{\creatim\yr2014\mo1\dy5\hr15\min34}{\revtim\yr2014\mo1\dy5\hr15\min35}{\printim\yr0\mo0\dy0\hr0\min0}}{\*\userprops}\deftab709
|
11
|
+
\viewscale110
|
12
|
+
{\*\pgdsctbl
|
13
|
+
{\pgdsc0\pgdscuse451\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0 \u27161\'3f\u28310\'3f\u12473\'3f\u12479\'3f\u12452\'3f\u12523\'3f;}}
|
14
|
+
\formshade\paperh16838\paperw11906\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\sectunlocked1\pgndec\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
|
15
|
+
{\*\ftnsep\chftnsep}\pgndec\pard\plain \s0\nowidctlpar\hyphpar0\aspalpha\ltrpar\cf0\kerning1\dbch\af5\langfe1041\dbch\af6\afs24\alang1081\loch\f3\hich\af3\fs24\lang1033{\rtlch \ltrch\loch
|
16
|
+
Page1}
|
17
|
+
\par }
|
Binary file
|
Binary file
|
data/test/helper.rb
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
# Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "pathname"
|
18
|
+
|
19
|
+
# Helper to locate test fixture files.
module FixtureHelper
  # Builds a path below the "fixture" directory that sits next to this
  # file.
  #
  # @param components [Array<String>] path components below "fixture".
  # @return [Pathname] the "fixture" directory itself when no component
  #   is given.
  def fixture_path(*components)
    base_path = Pathname(__dir__) + "fixture"
    base_path.join(*components)
  end
end
|
25
|
+
|
26
|
+
# Helpers for the decomposer tests. The including test case is expected
# to set @decomposer before calling #decompose.
module DecomposeHelper
  # Decomposes the file at +path+ with @decomposer. Every result that the
  # PDF decomposer accepts is decomposed once more by it; the other
  # results are collected as they are.
  #
  # @param path the path of the input file.
  # @return [Array] all collected data, in the order they were yielded.
  def decompose(path)
    data = ChupaText::InputData.new(path)

    pdf_decomposer = ChupaText::Decomposers::PDF.new({})
    decomposed = []
    @decomposer.decompose(data) do |decomposed_data|
      if pdf_decomposer.target?(decomposed_data)
        pdf_decomposer.decompose(decomposed_data) do |pdf_decomposed_data|
          decomposed << pdf_decomposed_data
        end
      else
        decomposed << decomposed_data
      end
    end
    decomposed
  end

  # Applies #normalize_producer to each element of +producers+.
  #
  # @param producers [Array] PDF producer values.
  # @return [Array] the normalized values, in the same order.
  def normalize_producers(producers)
    producers.map do |producer|
      normalize_producer(producer)
    end
  end

  # Replaces a versioned cairo producer such as
  # "cairo 1.16.0 (https://cairographics.org)" with plain "cairo", so
  # that expectations do not depend on the installed cairo version.
  # The "http://" form of the cairo URL is accepted as well as the
  # "https://" form. Any other value, including nil, is returned as is.
  #
  # @param producer [String, nil] a PDF producer value.
  # @return [String, nil] "cairo" or +producer+ itself.
  def normalize_producer(producer)
    if %r{\Acairo \d+\.\d+\.\d+ \(https?://cairographics\.org\)\z} =~ producer
      "cairo"
    else
      producer
    end
  end
end
|