chupa-text-decomposer-abiword 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.md ADDED
@@ -0,0 +1,38 @@
1
+ # README
2
+
3
+ ## Name
4
+
5
+ chupa-text-decomposer-abiword
6
+
7
+ ## Description
8
+
9
+ This is a ChupaText decomposer plugin to extract text and
10
+ meta-data from office documents such as Microsoft Word files and
11
+ LibreOffice Writer files.
12
+
13
+ You can use the `abiword` decomposer.
14
+
15
+ ## Install
16
+
17
+ Install chupa-text-decomposer-abiword gem:
18
+
19
+ ```
20
+ % gem install chupa-text-decomposer-abiword
21
+ ```
22
+
23
+ Now, you can extract text and meta-data from office documents:
24
+
25
+ ```
26
+ % chupa-text document.doc
27
+ ```
28
+
29
+ ## Author
30
+
31
+ * Sutou Kouhei `<kou@clear-code.com>`
32
+
33
+ ## License
34
+
35
+ LGPL 2.1 or later.
36
+
37
+ (Sutou Kouhei has a right to change the license including contributed
38
+ patches.)
data/Rakefile ADDED
@@ -0,0 +1,48 @@
1
+ # -*- ruby -*-
2
+ #
3
+ # Copyright (C) 2019 Sutou Kouhei <kou@clear-code.com>
4
+ #
5
+ # This library is free software; you can redistribute it and/or
6
+ # modify it under the terms of the GNU Lesser General Public
7
+ # License as published by the Free Software Foundation; either
8
+ # version 2.1 of the License, or (at your option) any later version.
9
+ #
10
+ # This library is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ # Lesser General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU Lesser General Public
16
+ # License along with this library; if not, write to the Free Software
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+
19
# Running rake without a task name runs the test suite.
task default: :test

require "pathname"

require "rubygems"
require "bundler/gem_helper"
require "packnga"

base_dir = Pathname(__FILE__).dirname

# Gem packaging tasks come from Bundler's helper; its version_tag is
# overridden so the tag is exactly the version string.
helper = Bundler::GemHelper.new(base_dir.to_s)
helper.define_singleton_method(:version_tag) { version }

helper.install
spec = helper.gemspec

# Documentation tasks: English is the original language, Japanese is the
# translation target.
Packnga::DocumentTask.new(spec) do |document_task|
  document_task.original_language = "en"
  document_task.translate_language = "ja"
end

# Release tasks with no extra configuration.
Packnga::ReleaseTask.new(spec) {}

desc "Run tests"
task :test do
  ruby "test/run-test.rb"
end
@@ -0,0 +1,50 @@
1
+ # -*- ruby -*-
2
+ #
3
+ # Copyright (C) 2019 Sutou Kouhei <kou@clear-code.com>
4
+ #
5
+ # This library is free software; you can redistribute it and/or
6
+ # modify it under the terms of the GNU Lesser General Public
7
+ # License as published by the Free Software Foundation; either
8
+ # version 2.1 of the License, or (at your option) any later version.
9
+ #
10
+ # This library is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ # Lesser General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU Lesser General Public
16
+ # License along with this library; if not, write to the Free Software
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+
19
# Removes the newlines at the very start and the very end of +entry+ and
# returns the remaining text followed by a single newline.
clean_white_space = lambda do |entry|
  trimmed = entry.gsub(/(\A\n+|\n+\z)/, '')
  "#{trimmed}\n"
end
22
+
23
Gem::Specification.new do |spec|
  # Everything read from disk below is resolved against the directory that
  # contains this gemspec, not the process's current directory, so the
  # specification evaluates the same way wherever it is loaded from.
  base_dir = File.dirname(File.expand_path(__FILE__))

  spec.name = "chupa-text-decomposer-abiword"
  spec.version = "1.0.0"
  spec.homepage = "https://github.com/ranguba/chupa-text-decomposer-abiword"
  spec.authors = ["Sutou Kouhei"]
  spec.email = ["kou@clear-code.com"]

  # The summary and description are taken from the "## Description" section
  # of README.md. Splitting on "## <title>" headings with a capture group
  # yields [..., title, body, ...], so the body follows its title.
  readme = File.read(File.join(base_dir, "README.md"), encoding: "UTF-8")
  entries = readme.split(/^\#\#\s(.*)$/)
  description_index = entries.index("Description")
  description_body = description_index && entries[description_index + 1]
  if description_body.nil?
    raise "README.md must contain a non-empty \"## Description\" section"
  end
  description = clean_white_space.call(description_body)
  # The first paragraph of the description doubles as the summary.
  spec.summary = description.split(/\n\n+/, 2).first
  spec.description = description
  spec.license = "LGPL-2.1+"

  spec.files = ["#{spec.name}.gemspec"]
  spec.files += ["README.md", "LICENSE.txt", "Rakefile", "Gemfile"]
  spec.files += [".yardopts"]
  # Glob from the gem root so the collected paths stay relative to it.
  Dir.chdir(base_dir) do
    spec.files += Dir.glob("lib/**/*.rb")
    spec.files += Dir.glob("doc/text/*")
    spec.files += Dir.glob("test/**/*")
  end

  spec.add_runtime_dependency("chupa-text")
  spec.add_runtime_dependency("chupa-text-decomposer-pdf")

  spec.add_development_dependency("bundler")
  spec.add_development_dependency("rake")
  spec.add_development_dependency("test-unit")
  spec.add_development_dependency("packnga")
  spec.add_development_dependency("kramdown")
end
data/doc/text/news.md ADDED
@@ -0,0 +1,5 @@
1
+ # News
2
+
3
+ ## 1.0.0: 2019-06-13
4
+
5
+ The first release!!!
@@ -0,0 +1,132 @@
1
+ # Copyright (C) 2019 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "tempfile"
18
+
19
module ChupaText
  module Decomposers
    # Decomposer that converts word-processor documents to PDF by running
    # the external AbiWord command. The converted PDF is handed to the
    # block given to #decompose as new data whose source is the original
    # document.
    class AbiWord < Decomposer
      include Loggable

      registry.register("abiword", self)

      # File extensions that #target? accepts.
      EXTENSIONS = [
        "abw",
        "doc",
        "docx",
        "odt",
        "rtf",
        "zabw",
      ]
      # MIME types that #target? accepts.
      MIME_TYPES = [
        "application/msword",
        "application/rtf",
        "application/vnd.oasis.opendocument.text",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/x-abiword",
      ]

      # Looks up the AbiWord command once and logs whether it was found.
      #
      # @param options [Hash] decomposer options; the :abiword entry, when
      #   present, is the first candidate for the command to run.
      def initialize(options)
        super
        @command = find_command
        debug do
          if @command
            "#{log_tag}[command][found] #{@command.path}"
          else
            "#{log_tag}[command][not-found]"
          end
        end
      end

      # @param data the data to examine; its extension and mime_type are
      #   checked.
      # @return [Boolean] false when no AbiWord command was found,
      #   otherwise whether the extension or the MIME type is listed in
      #   EXTENSIONS / MIME_TYPES.
      def target?(data)
        return false if @command.nil?
        EXTENSIONS.include?(data.extension) ||
          MIME_TYPES.include?(data.mime_type)
      end

      # Converts +data+ to PDF and yields the converted data. Nothing is
      # yielded when the conversion fails.
      #
      # @param data the document to convert.
      # @yieldparam pdf_data [VirtualFileData] the PDF version of +data+.
      def decompose(data)
        pdf_data = convert_to_pdf(data)
        return if pdf_data.nil?
        yield(pdf_data)
      end

      private
      # Returns the first existing command among the :abiword option, the
      # ABIWORD environment variable and plain "abiword", or nil when none
      # of them exists.
      def find_command
        candidates = [
          @options[:abiword],
          ENV["ABIWORD"],
          "abiword",
        ]
        candidates.each do |candidate|
          next if candidate.nil?
          command = ExternalCommand.new(candidate)
          return command if command.exist?
        end
        nil
      end

      # Runs the command with "--to pdf --to-name <temporary PDF>" on the
      # file at data.path, sending the command's stdout and stderr to
      # temporary log files.
      #
      # @return [VirtualFileData, nil] the PDF data, or nil (after logging
      #   the captured output as an error) when the command did not
      #   succeed.
      def convert_to_pdf(data)
        create_tempfiles(data) do |pdf, stdout, stderr|
          succeeded = @command.run("--to", "pdf",
                                   "--to-name", pdf.path,
                                   data.path.to_s,
                                   {
                                     data: data,
                                     spawn_options: {
                                       out: stdout.path,
                                       err: stderr.path,
                                     },
                                   })
          unless succeeded
            error do
              tag = "#{log_tag}[convert][exited][abnormally]"
              [
                tag,
                "output: <#{stdout.read}>",
                "error: <#{stderr.read}>",
              ].join("\n")
            end
            return nil
          end
          # NOTE(review): the temporary PDF is closed and removed once this
          # method returns, so VirtualFileData presumably copies pdf_input
          # while it is still open -- confirm against chupa-text.
          File.open(pdf.path, "rb") do |pdf_input|
            VirtualFileData.new(pdf_uri_for(data),
                                pdf_input,
                                source_data: data)
          end
        end
      end

      # Builds the URI string for the converted PDF from data.uri.
      #
      # Only the last path segment, before any "?query" or "#fragment",
      # is searched for an extension. Letting the match span "/", "?" or
      # "#" would rewrite "http://example.com/document" to
      # "http://example.pdf".
      #
      # @return [String] the location with its extension replaced by
      #   ".pdf" (any query/fragment is dropped), or the URI unchanged
      #   when the last path segment has no extension.
      def pdf_uri_for(data)
        uri = data.uri.to_s
        location = uri[/\A[^?#]*/]
        extension = %r{\.[^./]+\z}
        if extension =~ location
          location.sub(extension, ".pdf")
        else
          uri
        end
      end

      # Creates the temporary PDF, stdout log and stderr log files, yields
      # them in that order, and removes them when the block finishes.
      #
      # The files are created inside the begin block so that the ones
      # already created are still removed when creating a later one fails.
      #
      # @return the value of the block.
      def create_tempfiles(data)
        basename = File.basename(data.path)
        tempfiles = []
        begin
          [".pdf", ".stdout.log", ".stderr.log"].each do |suffix|
            tempfiles << Tempfile.new([basename, suffix])
          end
          yield(*tempfiles)
        ensure
          tempfiles.each(&:close!)
        end
      end

      # Prefix shared by every log message of this decomposer.
      def log_tag
        "[decomposer][abiword]"
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <!DOCTYPE abiword PUBLIC "-//ABISOURCE//DTD AWML 1.0 Strict//EN" "http://www.abisource.com/awml.dtd">
3
+ <abiword template="false" xmlns:ct="http://www.abisource.com/changetracking.dtd" xmlns:fo="http://www.w3.org/1999/XSL/Format" xmlns:math="http://www.w3.org/1998/Math/MathML" xid-max="4" xmlns:dc="http://purl.org/dc/elements/1.1/" fileformat="1.1" xmlns:svg="http://www.w3.org/2000/svg" xmlns:awml="http://www.abisource.com/awml.dtd" xmlns="http://www.abisource.com/awml.dtd" xmlns:xlink="http://www.w3.org/1999/xlink" version="3.0.2" xml:space="preserve" props="dom-dir:ltr; document-footnote-restart-section:0; document-endnote-type:numeric; document-endnote-place-enddoc:1; document-endnote-initial:1; lang:ja-JP; document-endnote-restart-section:0; document-footnote-restart-page:0; document-footnote-type:numeric; document-footnote-initial:1; document-endnote-place-endsection:0">
4
+ <!-- ======================================================================== -->
5
+ <!-- This file is an AbiWord document. -->
6
+ <!-- AbiWord is a free, Open Source word processor. -->
7
+ <!-- More information about AbiWord is available at http://www.abisource.com/ -->
8
+ <!-- You should not edit this file by hand. -->
9
+ <!-- ======================================================================== -->
10
+
11
+ <metadata>
12
+ <m key="abiword.date_last_changed">Thu Jun 13 16:15:17 2019
13
+ </m>
14
+ <m key="abiword.generator">AbiWord</m>
15
+ <m key="dc.date">Thu Jun 13 16:15:17 2019
16
+ </m>
17
+ <m key="dc.format">application/x-abiword</m>
18
+ <m key="meta:editing-cycles">1</m>
19
+ <m key="meta:editing-duration">P0D</m>
20
+ </metadata>
21
+ <rdf>
22
+ <t s="styles.xml" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/odf#StylesFile</t>
23
+ <t s="content.xml" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/odf#ContentFile</t>
24
+ <t s="manifest.rdf" p="http://docs.oasis-open.org/ns/office/1.2/meta/pkg#hasPart" objecttype="1" xsdtype="" >styles.xml</t>
25
+ <t s="manifest.rdf" p="http://docs.oasis-open.org/ns/office/1.2/meta/pkg#hasPart" objecttype="1" xsdtype="" >content.xml</t>
26
+ <t s="manifest.rdf" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/pkg#Document</t>
27
+ </rdf>
28
+ <history version="1" edit-time="16" last-saved="1560410117" uid="f5d2a67c-8daa-11e9-84b7-a514b42a119c">
29
+ <version id="1" started="1560410117" uid="ff487a92-8daa-11e9-84b7-a514b42a119c" auto="0" top-xid="4"/>
30
+ </history>
31
+ <styles>
32
+ <s type="P" name="Normal" props="lang:en-US; default-tab-interval:1.251cm; font-size:12pt; font-family:Liberation Serif; dom-dir:ltr"/>
33
+ <s type="P" name="Caption" basedon="Normal" followedby="Caption" props="margin-top:0.212cm; font-size:12pt; margin-bottom:0.212cm; font-style:italic"/>
34
+ <s type="P" name="Heading" basedon="Normal" followedby="Text body" props="margin-top:0.423cm; keep-with-next:yes; margin-bottom:0.212cm; font-family:Liberation Sans; font-size:14pt"/>
35
+ <s type="P" name="Text body" basedon="Normal" followedby="Text body" props="margin-bottom:0.212cm; margin-top:0cm"/>
36
+ </styles>
37
+ <pagesize pagetype="A4" orientation="portrait" width="210.000000" height="297.000000" units="mm" page-scale="1.000000"/>
38
+ <section xid="1" props="page-margin-right:2cm; page-width:21.001cm; page-margin-left:2cm; page-orientation:portrait; page-margin-bottom:2cm; page-margin-top:2cm; page-height:29.7cm">
39
+ <p style="Normal" xid="2">Page1</p>
40
+ <p xid="3"><pbr/></p>
41
+ <p style="Normal" props="" xid="4">Page2</p>
42
+ </section>
43
+ </abiword>
@@ -0,0 +1,41 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <!DOCTYPE abiword PUBLIC "-//ABISOURCE//DTD AWML 1.0 Strict//EN" "http://www.abisource.com/awml.dtd">
3
+ <abiword template="false" xmlns:ct="http://www.abisource.com/changetracking.dtd" xmlns:fo="http://www.w3.org/1999/XSL/Format" xmlns:math="http://www.w3.org/1998/Math/MathML" xid-max="2" xmlns:dc="http://purl.org/dc/elements/1.1/" fileformat="1.1" xmlns:svg="http://www.w3.org/2000/svg" xmlns:awml="http://www.abisource.com/awml.dtd" xmlns="http://www.abisource.com/awml.dtd" xmlns:xlink="http://www.w3.org/1999/xlink" version="3.0.2" xml:space="preserve" props="dom-dir:ltr; document-footnote-restart-section:0; document-endnote-type:numeric; document-endnote-place-enddoc:1; document-endnote-initial:1; lang:ja-JP; document-endnote-restart-section:0; document-footnote-restart-page:0; document-footnote-type:numeric; document-footnote-initial:1; document-endnote-place-endsection:0">
4
+ <!-- ======================================================================== -->
5
+ <!-- This file is an AbiWord document. -->
6
+ <!-- AbiWord is a free, Open Source word processor. -->
7
+ <!-- More information about AbiWord is available at http://www.abisource.com/ -->
8
+ <!-- You should not edit this file by hand. -->
9
+ <!-- ======================================================================== -->
10
+
11
+ <metadata>
12
+ <m key="abiword.date_last_changed">Thu Jun 13 16:15:31 2019
13
+ </m>
14
+ <m key="abiword.generator">AbiWord</m>
15
+ <m key="dc.date">Thu Jun 13 16:15:31 2019
16
+ </m>
17
+ <m key="dc.format">application/x-abiword</m>
18
+ <m key="meta:editing-cycles">1</m>
19
+ <m key="meta:editing-duration">P0D</m>
20
+ </metadata>
21
+ <rdf>
22
+ <t s="styles.xml" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/odf#StylesFile</t>
23
+ <t s="content.xml" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/odf#ContentFile</t>
24
+ <t s="manifest.rdf" p="http://docs.oasis-open.org/ns/office/1.2/meta/pkg#hasPart" objecttype="1" xsdtype="" >styles.xml</t>
25
+ <t s="manifest.rdf" p="http://docs.oasis-open.org/ns/office/1.2/meta/pkg#hasPart" objecttype="1" xsdtype="" >content.xml</t>
26
+ <t s="manifest.rdf" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/pkg#Document</t>
27
+ </rdf>
28
+ <history version="1" edit-time="7" last-saved="1560410131" uid="03403cde-8dab-11e9-9847-bcd35e03657c">
29
+ <version id="1" started="1560410131" uid="07b66e0a-8dab-11e9-9847-bcd35e03657c" auto="0" top-xid="2"/>
30
+ </history>
31
+ <styles>
32
+ <s type="P" name="Normal" props="lang:en-US; default-tab-interval:1.251cm; font-size:12pt; font-family:Liberation Serif; dom-dir:ltr"/>
33
+ <s type="P" name="Caption" basedon="Normal" followedby="Caption" props="margin-top:0.212cm; font-size:12pt; margin-bottom:0.212cm; font-style:italic"/>
34
+ <s type="P" name="Heading" basedon="Normal" followedby="Text body" props="margin-top:0.423cm; keep-with-next:yes; margin-bottom:0.212cm; font-family:Liberation Sans; font-size:14pt"/>
35
+ <s type="P" name="Text body" basedon="Normal" followedby="Text body" props="margin-bottom:0.212cm; margin-top:0cm"/>
36
+ </styles>
37
+ <pagesize pagetype="A4" orientation="portrait" width="210.000000" height="297.000000" units="mm" page-scale="1.000000"/>
38
+ <section xid="1" props="page-margin-right:2cm; page-width:21.001cm; page-margin-left:2cm; page-orientation:portrait; page-margin-bottom:2cm; page-margin-top:2cm; page-height:29.7cm">
39
+ <p style="Normal" xid="2">Page1</p>
40
+ </section>
41
+ </abiword>
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1,19 @@
1
+ {\rtf1\ansi\deff3\adeflang1025
2
+ {\fonttbl{\f0\froman\fprq2\fcharset0 Times New Roman;}{\f1\froman\fprq2\fcharset2 Symbol;}{\f2\fswiss\fprq2\fcharset0 Arial;}{\f3\froman\fprq2\fcharset0 Liberation Serif{\*\falt Times New Roman};}{\f4\fswiss\fprq2\fcharset0 Liberation Sans{\*\falt Arial};}{\f5\fnil\fprq2\fcharset128 \'83\'82\'83\'67\'83\'84L\'83\'7d\'83\'8b\'83\'78\'83\'8a3\'93\'99\'95\'9d;}{\f6\fnil\fprq2\fcharset0 Lohit Devanagari;}{\f7\fnil\fprq0\fcharset0 Lohit Devanagari;}}
3
+ {\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}
4
+ {\stylesheet{\s0\snext0\nowidctlpar\hyphpar0\aspalpha\ltrpar\cf0\kerning1\dbch\af5\langfe1041\dbch\af6\afs24\alang1081\loch\f3\hich\af3\fs24\lang1033 Normal;}
5
+ {\s15\sbasedon0\snext16\sb240\sa120\keepn\dbch\af5\dbch\af6\afs28\loch\f4\fs28 \u35211\'3f\u20986\'3f\u12375\'3f;}
6
+ {\s16\sbasedon0\snext16\sb0\sa120 \u26412\'3f\u25991\'3f;}
7
+ {\s17\sbasedon16\snext17\sb0\sa120\dbch\af7 \u12522\'3f\u12473\'3f\u12488\'3f;}
8
+ {\s18\sbasedon0\snext18\sb120\sa120\noline\i\dbch\af7\afs24\ai\fs24 \u12461\'3f\u12515\'3f\u12503\'3f\u12471\'3f\u12519\'3f\u12531\'3f;}
9
+ {\s19\sbasedon0\snext19\noline\dbch\af7 \u32034\'3f\u24341\'3f;}
10
+ }{\*\generator LibreOffice/6.1.5.2$Linux_X86_64 LibreOffice_project/10$Build-2}{\info{\creatim\yr2014\mo1\dy5\hr15\min35}{\revtim\yr2014\mo1\dy5\hr15\min36}{\printim\yr0\mo0\dy0\hr0\min0}}{\*\userprops}\deftab709
11
+ \viewscale110
12
+ {\*\pgdsctbl
13
+ {\pgdsc0\pgdscuse451\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0 \u27161\'3f\u28310\'3f\u12473\'3f\u12479\'3f\u12452\'3f\u12523\'3f;}}
14
+ \formshade\paperh16838\paperw11906\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\sectunlocked1\pgndec\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
15
+ {\*\ftnsep\chftnsep}\pgndec\pard\plain \s0\nowidctlpar\hyphpar0\aspalpha\ltrpar\cf0\kerning1\dbch\af5\langfe1041\dbch\af6\afs24\alang1081\loch\f3\hich\af3\fs24\lang1033{\rtlch \ltrch\loch
16
+ Page1}
17
+ \par \pard\plain \s0\nowidctlpar\hyphpar0\aspalpha\ltrpar\cf0\kerning1\dbch\af5\langfe1041\dbch\af6\afs24\alang1081\loch\f3\hich\af3\fs24\lang1033\pagebb{\rtlch \ltrch\loch
18
+ Page2}
19
+ \par }
@@ -0,0 +1,17 @@
1
+ {\rtf1\ansi\deff3\adeflang1025
2
+ {\fonttbl{\f0\froman\fprq2\fcharset0 Times New Roman;}{\f1\froman\fprq2\fcharset2 Symbol;}{\f2\fswiss\fprq2\fcharset0 Arial;}{\f3\froman\fprq2\fcharset0 Liberation Serif{\*\falt Times New Roman};}{\f4\fswiss\fprq2\fcharset0 Liberation Sans{\*\falt Arial};}{\f5\fnil\fprq2\fcharset128 \'83\'82\'83\'67\'83\'84L\'83\'7d\'83\'8b\'83\'78\'83\'8a3\'93\'99\'95\'9d;}{\f6\fnil\fprq2\fcharset0 Lohit Devanagari;}{\f7\fnil\fprq0\fcharset0 Lohit Devanagari;}}
3
+ {\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}
4
+ {\stylesheet{\s0\snext0\nowidctlpar\hyphpar0\aspalpha\ltrpar\cf0\kerning1\dbch\af5\langfe1041\dbch\af6\afs24\alang1081\loch\f3\hich\af3\fs24\lang1033 Normal;}
5
+ {\s15\sbasedon0\snext16\sb240\sa120\keepn\dbch\af5\dbch\af6\afs28\loch\f4\fs28 \u35211\'3f\u20986\'3f\u12375\'3f;}
6
+ {\s16\sbasedon0\snext16\sb0\sa120 \u26412\'3f\u25991\'3f;}
7
+ {\s17\sbasedon16\snext17\sb0\sa120\dbch\af7 \u12522\'3f\u12473\'3f\u12488\'3f;}
8
+ {\s18\sbasedon0\snext18\sb120\sa120\noline\i\dbch\af7\afs24\ai\fs24 \u12461\'3f\u12515\'3f\u12503\'3f\u12471\'3f\u12519\'3f\u12531\'3f;}
9
+ {\s19\sbasedon0\snext19\noline\dbch\af7 \u32034\'3f\u24341\'3f;}
10
+ }{\*\generator LibreOffice/6.1.5.2$Linux_X86_64 LibreOffice_project/10$Build-2}{\info{\creatim\yr2014\mo1\dy5\hr15\min34}{\revtim\yr2014\mo1\dy5\hr15\min35}{\printim\yr0\mo0\dy0\hr0\min0}}{\*\userprops}\deftab709
11
+ \viewscale110
12
+ {\*\pgdsctbl
13
+ {\pgdsc0\pgdscuse451\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0 \u27161\'3f\u28310\'3f\u12473\'3f\u12479\'3f\u12452\'3f\u12523\'3f;}}
14
+ \formshade\paperh16838\paperw11906\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\sectunlocked1\pgndec\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
15
+ {\*\ftnsep\chftnsep}\pgndec\pard\plain \s0\nowidctlpar\hyphpar0\aspalpha\ltrpar\cf0\kerning1\dbch\af5\langfe1041\dbch\af6\afs24\alang1081\loch\f3\hich\af3\fs24\lang1033{\rtlch \ltrch\loch
16
+ Page1}
17
+ \par }
Binary file
Binary file
data/test/helper.rb ADDED
@@ -0,0 +1,57 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "pathname"
18
+
19
# Test helper for locating fixture files.
module FixtureHelper
  # Builds a path below the "fixture" directory located next to this file.
  #
  # @param components [Array<String>] path components under "fixture".
  # @return [Pathname] the fixture path.
  def fixture_path(*components)
    Pathname(__dir__).join("fixture").join(*components)
  end
end
25
+
26
# Test helper that runs the decomposer under test (@decomposer) and
# normalizes values that vary between environments.
module DecomposeHelper
  # Decomposes the file at +path+ with @decomposer. Each result that the
  # PDF decomposer accepts is decomposed once more by it; every other
  # result is collected as it is.
  #
  # @param path the path of the input file.
  # @return [Array] the collected data, in the order they were yielded.
  def decompose(path)
    input = ChupaText::InputData.new(path)

    pdf_decomposer = ChupaText::Decomposers::PDF.new({})
    results = []
    @decomposer.decompose(input) do |intermediate|
      unless pdf_decomposer.target?(intermediate)
        results << intermediate
        next
      end
      pdf_decomposer.decompose(intermediate) do |from_pdf|
        results << from_pdf
      end
    end
    results
  end

  # Applies #normalize_producer to every element of +producers+.
  #
  # @return [Array] the normalized producers.
  def normalize_producers(producers)
    producers.map { |producer| normalize_producer(producer) }
  end

  # Collapses a versioned cairo producer string such as
  # "cairo 1.16.0 (https://cairographics.org)" to plain "cairo"; any other
  # value is returned unchanged.
  def normalize_producer(producer)
    case producer
    when /\Acairo \d+\.\d+\.\d+ \(https:\/\/cairographics\.org\)\z/
      "cairo"
    else
      producer
    end
  end
end