chupa-text-decomposer-abiword 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md ADDED
@@ -0,0 +1,38 @@
1
+ # README
2
+
3
+ ## Name
4
+
5
+ chupa-text-decomposer-abiword
6
+
7
+ ## Description
8
+
9
+ This is a ChupaText decomposer plugin to extract text and
10
+ meta-data from office documents such as Microsoft Word files and
11
+ LibreOffice Writer files.
12
+
13
+ You can use the `abiword` decomposer.
14
+
15
+ ## Install
16
+
17
+ Install chupa-text-decomposer-abiword gem:
18
+
19
+ ```
20
+ % gem install chupa-text-decomposer-abiword
21
+ ```
22
+
23
+ Now, you can extract text and meta-data from office documents:
24
+
25
+ ```
26
+ % chupa-text document.doc
27
+ ```
28
+
29
+ ## Author
30
+
31
+ * Sutou Kouhei `<kou@clear-code.com>`
32
+
33
+ ## License
34
+
35
+ LGPL 2.1 or later.
36
+
37
+ (Sutou Kouhei has a right to change the license including contributed
38
+ patches.)
data/Rakefile ADDED
@@ -0,0 +1,48 @@
1
+ # -*- ruby -*-
2
+ #
3
+ # Copyright (C) 2019 Sutou Kouhei <kou@clear-code.com>
4
+ #
5
+ # This library is free software; you can redistribute it and/or
6
+ # modify it under the terms of the GNU Lesser General Public
7
+ # License as published by the Free Software Foundation; either
8
+ # version 2.1 of the License, or (at your option) any later version.
9
+ #
10
+ # This library is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ # Lesser General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU Lesser General Public
16
+ # License along with this library; if not, write to the Free Software
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+
19
# Running `rake` without arguments runs the test task.
task default: :test

require "pathname"

require "rubygems"
require "bundler/gem_helper"
require "packnga"

project_root = Pathname(__FILE__).dirname

gem_helper = Bundler::GemHelper.new(project_root.to_s)
# Override version_tag on this helper instance so that it returns the
# version as-is.
class << gem_helper
  def version_tag
    version
  end
end

gem_helper.install
gemspec = gem_helper.gemspec

# Documentation tasks: the original text is English and it is
# translated to Japanese.
Packnga::DocumentTask.new(gemspec) do |document_task|
  document_task.original_language = "en"
  document_task.translate_language = "ja"
end

# Release tasks with no extra configuration.
Packnga::ReleaseTask.new(gemspec) do
end

desc "Run tests"
task :test do
  ruby("test/run-test.rb")
end
@@ -0,0 +1,50 @@
1
+ # -*- ruby -*-
2
+ #
3
+ # Copyright (C) 2019 Sutou Kouhei <kou@clear-code.com>
4
+ #
5
+ # This library is free software; you can redistribute it and/or
6
+ # modify it under the terms of the GNU Lesser General Public
7
+ # License as published by the Free Software Foundation; either
8
+ # version 2.1 of the License, or (at your option) any later version.
9
+ #
10
+ # This library is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ # Lesser General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU Lesser General Public
16
+ # License along with this library; if not, write to the Free Software
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+
19
# Removes leading and trailing blank lines from a README section and
# terminates the result with exactly one newline.
trim_blank_edges = lambda do |section|
  "#{section.gsub(/(\A\n+|\n+\z)/, '')}\n"
end

Gem::Specification.new do |spec|
  spec.name = "chupa-text-decomposer-abiword"
  spec.version = "1.0.0"
  spec.homepage = "https://github.com/ranguba/chupa-text-decomposer-abiword"
  spec.authors = ["Sutou Kouhei"]
  spec.email = ["kou@clear-code.com"]

  # The split pattern has a capture group, so the resulting array
  # alternates between "## " heading titles and the text that follows
  # them. The description is the element right after the "Description"
  # title.
  readme_sections = File.read("README.md", encoding: "UTF-8").split(/^\#\#\s(.*)$/)
  description_position = readme_sections.index("Description") + 1
  description = trim_blank_edges.call(readme_sections[description_position])
  # The summary is the first paragraph of the description.
  spec.summary = description.split(/\n\n+/, 2).first
  spec.description = description
  spec.license = "LGPL-2.1+"

  packaged_files = [
    "#{spec.name}.gemspec",
    "README.md",
    "LICENSE.txt",
    "Rakefile",
    "Gemfile",
    ".yardopts",
  ]
  ["lib/**/*.rb", "doc/text/*", "test/**/*"].each do |pattern|
    packaged_files.concat(Dir.glob(pattern))
  end
  spec.files = packaged_files

  ["chupa-text", "chupa-text-decomposer-pdf"].each do |gem_name|
    spec.add_runtime_dependency(gem_name)
  end

  ["bundler", "rake", "test-unit", "packnga", "kramdown"].each do |gem_name|
    spec.add_development_dependency(gem_name)
  end
end
data/doc/text/news.md ADDED
@@ -0,0 +1,5 @@
1
+ # News
2
+
3
+ ## 1.0.0: 2019-06-13
4
+
5
+ The first release!!!
@@ -0,0 +1,132 @@
1
+ # Copyright (C) 2019 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "tempfile"
18
+
19
module ChupaText
  module Decomposers
    # Decomposer that converts a document to PDF by running an external
    # `abiword` command and yields the resulting PDF data to the caller.
    class AbiWord < Decomposer
      include Loggable

      registry.register("abiword", self)

      # File extensions accepted by #target?.
      EXTENSIONS = [
        "abw",
        "doc",
        "docx",
        "odt",
        "rtf",
        "zabw",
      ]
      # MIME types accepted by #target?.
      MIME_TYPES = [
        "application/msword",
        "application/rtf",
        "application/vnd.oasis.opendocument.text",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/x-abiword",
      ]

      # Looks up the command to run once at construction time and logs
      # whether it was found.
      #
      # @param options [Hash] decomposer options; the `:abiword` entry,
      #   when present, is tried first as the command.
      def initialize(options)
        super
        @command = find_command
        debug do
          if @command
            "#{log_tag}[command][found] #{@command.path}"
          else
            "#{log_tag}[command][not-found]"
          end
        end
      end

      # @param data the data to examine.
      # @return [Boolean] `false` when no command was found; otherwise
      #   whether the extension or the MIME type of `data` is listed in
      #   EXTENSIONS or MIME_TYPES.
      def target?(data)
        return false if @command.nil?
        EXTENSIONS.include?(data.extension) ||
          MIME_TYPES.include?(data.mime_type)
      end

      # Converts `data` to PDF and yields the PDF data. Nothing is
      # yielded when the conversion fails.
      #
      # @param data the data to convert.
      # @yieldparam pdf_data the converted PDF as virtual file data.
      def decompose(data)
        pdf_data = convert_to_pdf(data)
        return if pdf_data.nil?
        yield(pdf_data)
      end

      private
      # Tries the `:abiword` option, the `ABIWORD` environment variable
      # and the plain `abiword` name, in that order.
      #
      # @return [ExternalCommand, nil] the first candidate that exists,
      #   or `nil` when none does.
      def find_command
        candidates = [
          @options[:abiword],
          ENV["ABIWORD"],
          "abiword",
        ]
        candidates.each do |candidate|
          next if candidate.nil?
          command = ExternalCommand.new(candidate)
          return command if command.exist?
        end
        nil
      end

      # Runs the command with `--to pdf` on the path of `data`, writing
      # the PDF into a temporary file. The command's stdout and stderr
      # are redirected to temporary log files so that they can be
      # reported when the run fails.
      #
      # @return [VirtualFileData, nil] the PDF data, or `nil` when the
      #   command did not succeed.
      def convert_to_pdf(data)
        create_tempfiles(data) do |pdf, stdout, stderr|
          succeeded = @command.run("--to", "pdf",
                                   "--to-name", pdf.path,
                                   data.path.to_s,
                                   {
                                     data: data,
                                     spawn_options: {
                                       out: stdout.path,
                                       err: stderr.path,
                                     },
                                   })
          unless succeeded
            error do
              tag = "#{log_tag}[convert][exited][abnormally]"
              [
                tag,
                "output: <#{stdout.read}>",
                "error: <#{stderr.read}>",
              ].join("\n")
            end
            return nil
          end
          # Replace the last extension of the source URI with ".pdf".
          # The pattern is anchored at the end, so at most one
          # replacement can happen.
          normalized_pdf_uri = data.uri.to_s.sub(/\.[^.]+\z/, ".pdf")
          File.open(pdf.path, "rb") do |pdf_input|
            VirtualFileData.new(normalized_pdf_uri,
                                pdf_input,
                                source_data: data)
          end
        end
      end

      # Creates temporary files for the PDF output, the command's stdout
      # and the command's stderr, yields them, and always removes them
      # afterwards.
      #
      # The files are created inside the protected region so that the
      # ones already created are still closed and unlinked when creating
      # a later one raises.
      #
      # @yieldparam pdf [Tempfile] destination of the converted PDF.
      # @yieldparam stdout [Tempfile] log file for the command's stdout.
      # @yieldparam stderr [Tempfile] log file for the command's stderr.
      # @return the value of the block.
      def create_tempfiles(data)
        basename = File.basename(data.path)
        tempfiles = []
        begin
          [".pdf", ".stdout.log", ".stderr.log"].each do |suffix|
            tempfiles << Tempfile.new([basename, suffix])
          end
          yield(*tempfiles)
        ensure
          tempfiles.each(&:close!)
        end
      end

      # Prefix used for every log message of this decomposer.
      def log_tag
        "[decomposer][abiword]"
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <!DOCTYPE abiword PUBLIC "-//ABISOURCE//DTD AWML 1.0 Strict//EN" "http://www.abisource.com/awml.dtd">
3
+ <abiword template="false" xmlns:ct="http://www.abisource.com/changetracking.dtd" xmlns:fo="http://www.w3.org/1999/XSL/Format" xmlns:math="http://www.w3.org/1998/Math/MathML" xid-max="4" xmlns:dc="http://purl.org/dc/elements/1.1/" fileformat="1.1" xmlns:svg="http://www.w3.org/2000/svg" xmlns:awml="http://www.abisource.com/awml.dtd" xmlns="http://www.abisource.com/awml.dtd" xmlns:xlink="http://www.w3.org/1999/xlink" version="3.0.2" xml:space="preserve" props="dom-dir:ltr; document-footnote-restart-section:0; document-endnote-type:numeric; document-endnote-place-enddoc:1; document-endnote-initial:1; lang:ja-JP; document-endnote-restart-section:0; document-footnote-restart-page:0; document-footnote-type:numeric; document-footnote-initial:1; document-endnote-place-endsection:0">
4
+ <!-- ======================================================================== -->
5
+ <!-- This file is an AbiWord document. -->
6
+ <!-- AbiWord is a free, Open Source word processor. -->
7
+ <!-- More information about AbiWord is available at http://www.abisource.com/ -->
8
+ <!-- You should not edit this file by hand. -->
9
+ <!-- ======================================================================== -->
10
+
11
+ <metadata>
12
+ <m key="abiword.date_last_changed">Thu Jun 13 16:15:17 2019
13
+ </m>
14
+ <m key="abiword.generator">AbiWord</m>
15
+ <m key="dc.date">Thu Jun 13 16:15:17 2019
16
+ </m>
17
+ <m key="dc.format">application/x-abiword</m>
18
+ <m key="meta:editing-cycles">1</m>
19
+ <m key="meta:editing-duration">P0D</m>
20
+ </metadata>
21
+ <rdf>
22
+ <t s="styles.xml" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/odf#StylesFile</t>
23
+ <t s="content.xml" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/odf#ContentFile</t>
24
+ <t s="manifest.rdf" p="http://docs.oasis-open.org/ns/office/1.2/meta/pkg#hasPart" objecttype="1" xsdtype="" >styles.xml</t>
25
+ <t s="manifest.rdf" p="http://docs.oasis-open.org/ns/office/1.2/meta/pkg#hasPart" objecttype="1" xsdtype="" >content.xml</t>
26
+ <t s="manifest.rdf" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/pkg#Document</t>
27
+ </rdf>
28
+ <history version="1" edit-time="16" last-saved="1560410117" uid="f5d2a67c-8daa-11e9-84b7-a514b42a119c">
29
+ <version id="1" started="1560410117" uid="ff487a92-8daa-11e9-84b7-a514b42a119c" auto="0" top-xid="4"/>
30
+ </history>
31
+ <styles>
32
+ <s type="P" name="Normal" props="lang:en-US; default-tab-interval:1.251cm; font-size:12pt; font-family:Liberation Serif; dom-dir:ltr"/>
33
+ <s type="P" name="Caption" basedon="Normal" followedby="Caption" props="margin-top:0.212cm; font-size:12pt; margin-bottom:0.212cm; font-style:italic"/>
34
+ <s type="P" name="Heading" basedon="Normal" followedby="Text body" props="margin-top:0.423cm; keep-with-next:yes; margin-bottom:0.212cm; font-family:Liberation Sans; font-size:14pt"/>
35
+ <s type="P" name="Text body" basedon="Normal" followedby="Text body" props="margin-bottom:0.212cm; margin-top:0cm"/>
36
+ </styles>
37
+ <pagesize pagetype="A4" orientation="portrait" width="210.000000" height="297.000000" units="mm" page-scale="1.000000"/>
38
+ <section xid="1" props="page-margin-right:2cm; page-width:21.001cm; page-margin-left:2cm; page-orientation:portrait; page-margin-bottom:2cm; page-margin-top:2cm; page-height:29.7cm">
39
+ <p style="Normal" xid="2">Page1</p>
40
+ <p xid="3"><pbr/></p>
41
+ <p style="Normal" props="" xid="4">Page2</p>
42
+ </section>
43
+ </abiword>
@@ -0,0 +1,41 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <!DOCTYPE abiword PUBLIC "-//ABISOURCE//DTD AWML 1.0 Strict//EN" "http://www.abisource.com/awml.dtd">
3
+ <abiword template="false" xmlns:ct="http://www.abisource.com/changetracking.dtd" xmlns:fo="http://www.w3.org/1999/XSL/Format" xmlns:math="http://www.w3.org/1998/Math/MathML" xid-max="2" xmlns:dc="http://purl.org/dc/elements/1.1/" fileformat="1.1" xmlns:svg="http://www.w3.org/2000/svg" xmlns:awml="http://www.abisource.com/awml.dtd" xmlns="http://www.abisource.com/awml.dtd" xmlns:xlink="http://www.w3.org/1999/xlink" version="3.0.2" xml:space="preserve" props="dom-dir:ltr; document-footnote-restart-section:0; document-endnote-type:numeric; document-endnote-place-enddoc:1; document-endnote-initial:1; lang:ja-JP; document-endnote-restart-section:0; document-footnote-restart-page:0; document-footnote-type:numeric; document-footnote-initial:1; document-endnote-place-endsection:0">
4
+ <!-- ======================================================================== -->
5
+ <!-- This file is an AbiWord document. -->
6
+ <!-- AbiWord is a free, Open Source word processor. -->
7
+ <!-- More information about AbiWord is available at http://www.abisource.com/ -->
8
+ <!-- You should not edit this file by hand. -->
9
+ <!-- ======================================================================== -->
10
+
11
+ <metadata>
12
+ <m key="abiword.date_last_changed">Thu Jun 13 16:15:31 2019
13
+ </m>
14
+ <m key="abiword.generator">AbiWord</m>
15
+ <m key="dc.date">Thu Jun 13 16:15:31 2019
16
+ </m>
17
+ <m key="dc.format">application/x-abiword</m>
18
+ <m key="meta:editing-cycles">1</m>
19
+ <m key="meta:editing-duration">P0D</m>
20
+ </metadata>
21
+ <rdf>
22
+ <t s="styles.xml" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/odf#StylesFile</t>
23
+ <t s="content.xml" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/odf#ContentFile</t>
24
+ <t s="manifest.rdf" p="http://docs.oasis-open.org/ns/office/1.2/meta/pkg#hasPart" objecttype="1" xsdtype="" >styles.xml</t>
25
+ <t s="manifest.rdf" p="http://docs.oasis-open.org/ns/office/1.2/meta/pkg#hasPart" objecttype="1" xsdtype="" >content.xml</t>
26
+ <t s="manifest.rdf" p="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" objecttype="1" xsdtype="" >http://docs.oasis-open.org/ns/office/1.2/meta/pkg#Document</t>
27
+ </rdf>
28
+ <history version="1" edit-time="7" last-saved="1560410131" uid="03403cde-8dab-11e9-9847-bcd35e03657c">
29
+ <version id="1" started="1560410131" uid="07b66e0a-8dab-11e9-9847-bcd35e03657c" auto="0" top-xid="2"/>
30
+ </history>
31
+ <styles>
32
+ <s type="P" name="Normal" props="lang:en-US; default-tab-interval:1.251cm; font-size:12pt; font-family:Liberation Serif; dom-dir:ltr"/>
33
+ <s type="P" name="Caption" basedon="Normal" followedby="Caption" props="margin-top:0.212cm; font-size:12pt; margin-bottom:0.212cm; font-style:italic"/>
34
+ <s type="P" name="Heading" basedon="Normal" followedby="Text body" props="margin-top:0.423cm; keep-with-next:yes; margin-bottom:0.212cm; font-family:Liberation Sans; font-size:14pt"/>
35
+ <s type="P" name="Text body" basedon="Normal" followedby="Text body" props="margin-bottom:0.212cm; margin-top:0cm"/>
36
+ </styles>
37
+ <pagesize pagetype="A4" orientation="portrait" width="210.000000" height="297.000000" units="mm" page-scale="1.000000"/>
38
+ <section xid="1" props="page-margin-right:2cm; page-width:21.001cm; page-margin-left:2cm; page-orientation:portrait; page-margin-bottom:2cm; page-margin-top:2cm; page-height:29.7cm">
39
+ <p style="Normal" xid="2">Page1</p>
40
+ </section>
41
+ </abiword>
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1,19 @@
1
+ {\rtf1\ansi\deff3\adeflang1025
2
+ {\fonttbl{\f0\froman\fprq2\fcharset0 Times New Roman;}{\f1\froman\fprq2\fcharset2 Symbol;}{\f2\fswiss\fprq2\fcharset0 Arial;}{\f3\froman\fprq2\fcharset0 Liberation Serif{\*\falt Times New Roman};}{\f4\fswiss\fprq2\fcharset0 Liberation Sans{\*\falt Arial};}{\f5\fnil\fprq2\fcharset128 \'83\'82\'83\'67\'83\'84L\'83\'7d\'83\'8b\'83\'78\'83\'8a3\'93\'99\'95\'9d;}{\f6\fnil\fprq2\fcharset0 Lohit Devanagari;}{\f7\fnil\fprq0\fcharset0 Lohit Devanagari;}}
3
+ {\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}
4
+ {\stylesheet{\s0\snext0\nowidctlpar\hyphpar0\aspalpha\ltrpar\cf0\kerning1\dbch\af5\langfe1041\dbch\af6\afs24\alang1081\loch\f3\hich\af3\fs24\lang1033 Normal;}
5
+ {\s15\sbasedon0\snext16\sb240\sa120\keepn\dbch\af5\dbch\af6\afs28\loch\f4\fs28 \u35211\'3f\u20986\'3f\u12375\'3f;}
6
+ {\s16\sbasedon0\snext16\sb0\sa120 \u26412\'3f\u25991\'3f;}
7
+ {\s17\sbasedon16\snext17\sb0\sa120\dbch\af7 \u12522\'3f\u12473\'3f\u12488\'3f;}
8
+ {\s18\sbasedon0\snext18\sb120\sa120\noline\i\dbch\af7\afs24\ai\fs24 \u12461\'3f\u12515\'3f\u12503\'3f\u12471\'3f\u12519\'3f\u12531\'3f;}
9
+ {\s19\sbasedon0\snext19\noline\dbch\af7 \u32034\'3f\u24341\'3f;}
10
+ }{\*\generator LibreOffice/6.1.5.2$Linux_X86_64 LibreOffice_project/10$Build-2}{\info{\creatim\yr2014\mo1\dy5\hr15\min35}{\revtim\yr2014\mo1\dy5\hr15\min36}{\printim\yr0\mo0\dy0\hr0\min0}}{\*\userprops}\deftab709
11
+ \viewscale110
12
+ {\*\pgdsctbl
13
+ {\pgdsc0\pgdscuse451\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0 \u27161\'3f\u28310\'3f\u12473\'3f\u12479\'3f\u12452\'3f\u12523\'3f;}}
14
+ \formshade\paperh16838\paperw11906\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\sectunlocked1\pgndec\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
15
+ {\*\ftnsep\chftnsep}\pgndec\pard\plain \s0\nowidctlpar\hyphpar0\aspalpha\ltrpar\cf0\kerning1\dbch\af5\langfe1041\dbch\af6\afs24\alang1081\loch\f3\hich\af3\fs24\lang1033{\rtlch \ltrch\loch
16
+ Page1}
17
+ \par \pard\plain \s0\nowidctlpar\hyphpar0\aspalpha\ltrpar\cf0\kerning1\dbch\af5\langfe1041\dbch\af6\afs24\alang1081\loch\f3\hich\af3\fs24\lang1033\pagebb{\rtlch \ltrch\loch
18
+ Page2}
19
+ \par }
@@ -0,0 +1,17 @@
1
+ {\rtf1\ansi\deff3\adeflang1025
2
+ {\fonttbl{\f0\froman\fprq2\fcharset0 Times New Roman;}{\f1\froman\fprq2\fcharset2 Symbol;}{\f2\fswiss\fprq2\fcharset0 Arial;}{\f3\froman\fprq2\fcharset0 Liberation Serif{\*\falt Times New Roman};}{\f4\fswiss\fprq2\fcharset0 Liberation Sans{\*\falt Arial};}{\f5\fnil\fprq2\fcharset128 \'83\'82\'83\'67\'83\'84L\'83\'7d\'83\'8b\'83\'78\'83\'8a3\'93\'99\'95\'9d;}{\f6\fnil\fprq2\fcharset0 Lohit Devanagari;}{\f7\fnil\fprq0\fcharset0 Lohit Devanagari;}}
3
+ {\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}
4
+ {\stylesheet{\s0\snext0\nowidctlpar\hyphpar0\aspalpha\ltrpar\cf0\kerning1\dbch\af5\langfe1041\dbch\af6\afs24\alang1081\loch\f3\hich\af3\fs24\lang1033 Normal;}
5
+ {\s15\sbasedon0\snext16\sb240\sa120\keepn\dbch\af5\dbch\af6\afs28\loch\f4\fs28 \u35211\'3f\u20986\'3f\u12375\'3f;}
6
+ {\s16\sbasedon0\snext16\sb0\sa120 \u26412\'3f\u25991\'3f;}
7
+ {\s17\sbasedon16\snext17\sb0\sa120\dbch\af7 \u12522\'3f\u12473\'3f\u12488\'3f;}
8
+ {\s18\sbasedon0\snext18\sb120\sa120\noline\i\dbch\af7\afs24\ai\fs24 \u12461\'3f\u12515\'3f\u12503\'3f\u12471\'3f\u12519\'3f\u12531\'3f;}
9
+ {\s19\sbasedon0\snext19\noline\dbch\af7 \u32034\'3f\u24341\'3f;}
10
+ }{\*\generator LibreOffice/6.1.5.2$Linux_X86_64 LibreOffice_project/10$Build-2}{\info{\creatim\yr2014\mo1\dy5\hr15\min34}{\revtim\yr2014\mo1\dy5\hr15\min35}{\printim\yr0\mo0\dy0\hr0\min0}}{\*\userprops}\deftab709
11
+ \viewscale110
12
+ {\*\pgdsctbl
13
+ {\pgdsc0\pgdscuse451\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0 \u27161\'3f\u28310\'3f\u12473\'3f\u12479\'3f\u12452\'3f\u12523\'3f;}}
14
+ \formshade\paperh16838\paperw11906\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\sectunlocked1\pgndec\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
15
+ {\*\ftnsep\chftnsep}\pgndec\pard\plain \s0\nowidctlpar\hyphpar0\aspalpha\ltrpar\cf0\kerning1\dbch\af5\langfe1041\dbch\af6\afs24\alang1081\loch\f3\hich\af3\fs24\lang1033{\rtlch \ltrch\loch
16
+ Page1}
17
+ \par }
Binary file
Binary file
data/test/helper.rb ADDED
@@ -0,0 +1,57 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "pathname"
18
+
19
# Helpers to locate test fixture files.
module FixtureHelper
  # Builds a path below the "fixture" directory that sits next to this
  # file.
  #
  # @param components [Array<String>] path components under "fixture".
  # @return [Pathname] the fixture path.
  def fixture_path(*components)
    fixture_root = Pathname(__dir__).join("fixture")
    fixture_root.join(*components)
  end
end
25
+
26
# Helpers to run the decomposer under test and to normalize its output.
module DecomposeHelper
  # Decomposes the file at `path` with `@decomposer`. Each result that
  # the PDF decomposer accepts is decomposed again with it; other
  # results are collected as they are.
  #
  # @param path the path of the input file.
  # @return [Array] the collected decomposed data.
  def decompose(path)
    input = ChupaText::InputData.new(path)

    pdf_decomposer = ChupaText::Decomposers::PDF.new({})
    results = []
    @decomposer.decompose(input) do |intermediate|
      unless pdf_decomposer.target?(intermediate)
        results << intermediate
        next
      end
      pdf_decomposer.decompose(intermediate) do |pdf_result|
        results << pdf_result
      end
    end
    results
  end

  # Applies #normalize_producer to every element.
  #
  # @param producers [Array] producer values.
  # @return [Array] the normalized producer values.
  def normalize_producers(producers)
    producers.map { |producer| normalize_producer(producer) }
  end

  # Collapses a versioned cairo producer string to "cairo" so that the
  # result does not depend on the cairo version. Any other value is
  # returned unchanged.
  #
  # @param producer the producer value.
  # @return "cairo" or the given value.
  def normalize_producer(producer)
    cairo_pattern = /\Acairo \d+\.\d+\.\d+ \(https:\/\/cairographics\.org\)\z/
    return "cairo" if cairo_pattern.match?(producer)
    producer
  end
end