chupa-text 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +5 -0
- data/Gemfile +21 -0
- data/LICENSE.txt +502 -0
- data/README.md +91 -0
- data/Rakefile +46 -0
- data/bin/chupa-text +21 -0
- data/bin/chupa-text-generate-decomposer +21 -0
- data/chupa-text.gemspec +58 -0
- data/data/chupa-text.conf +5 -0
- data/data/mime-types.conf +19 -0
- data/doc/text/command-line.md +136 -0
- data/doc/text/decomposer.md +343 -0
- data/doc/text/library.md +72 -0
- data/doc/text/news.md +5 -0
- data/lib/chupa-text.rb +37 -0
- data/lib/chupa-text/command.rb +18 -0
- data/lib/chupa-text/command/chupa-text-generate-decomposer.rb +324 -0
- data/lib/chupa-text/command/chupa-text.rb +102 -0
- data/lib/chupa-text/configuration-loader.rb +95 -0
- data/lib/chupa-text/configuration.rb +49 -0
- data/lib/chupa-text/data.rb +149 -0
- data/lib/chupa-text/decomposer-registry.rb +37 -0
- data/lib/chupa-text/decomposer.rb +37 -0
- data/lib/chupa-text/decomposers.rb +59 -0
- data/lib/chupa-text/decomposers/csv.rb +44 -0
- data/lib/chupa-text/decomposers/gzip.rb +51 -0
- data/lib/chupa-text/decomposers/tar.rb +42 -0
- data/lib/chupa-text/decomposers/xml.rb +55 -0
- data/lib/chupa-text/extractor.rb +91 -0
- data/lib/chupa-text/file-content.rb +35 -0
- data/lib/chupa-text/formatters.rb +17 -0
- data/lib/chupa-text/formatters/json.rb +60 -0
- data/lib/chupa-text/input-data.rb +58 -0
- data/lib/chupa-text/mime-type-registry.rb +41 -0
- data/lib/chupa-text/mime-type.rb +36 -0
- data/lib/chupa-text/text-data.rb +26 -0
- data/lib/chupa-text/version.rb +19 -0
- data/lib/chupa-text/virtual-content.rb +91 -0
- data/lib/chupa-text/virtual-file-data.rb +46 -0
- data/test/command/test-chupa-text.rb +178 -0
- data/test/decomposers/test-csv.rb +48 -0
- data/test/decomposers/test-gzip.rb +113 -0
- data/test/decomposers/test-tar.rb +78 -0
- data/test/decomposers/test-xml.rb +58 -0
- data/test/fixture/command/chupa-text/hello.txt +1 -0
- data/test/fixture/command/chupa-text/hello.txt.gz +0 -0
- data/test/fixture/command/chupa-text/no-decomposer.conf +3 -0
- data/test/fixture/extractor/hello.txt +1 -0
- data/test/fixture/gzip/hello.tar.gz +0 -0
- data/test/fixture/gzip/hello.tgz +0 -0
- data/test/fixture/gzip/hello.txt.gz +0 -0
- data/test/fixture/tar/directory.tar +0 -0
- data/test/fixture/tar/top-level.tar +0 -0
- data/test/helper.rb +25 -0
- data/test/run-test.rb +35 -0
- data/test/test-configuration-loader.rb +54 -0
- data/test/test-data.rb +85 -0
- data/test/test-decomposer-registry.rb +30 -0
- data/test/test-decomposer.rb +41 -0
- data/test/test-decomposers.rb +59 -0
- data/test/test-extractor.rb +125 -0
- data/test/test-file-content.rb +51 -0
- data/test/test-mime-type-registry.rb +48 -0
- data/test/test-text-data.rb +36 -0
- data/test/test-virtual-content.rb +103 -0
- metadata +183 -0
data/doc/text/library.md
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
# How to use ChupaText as a Ruby library
|
2
|
+
|
3
|
+
You can use ChupaText as a Ruby library. If you want to extract text
|
4
|
+
data from many inputs, the `chupa-text` command may be
|
5
|
+
inefficient. You need to execute `chupa-text` command to process one
|
6
|
+
input file. You need to execute `chupa-text` command N times to
|
7
|
+
process N input files. It means that you need to initialize ChupaText
|
8
|
+
N times. It may be inefficient.
|
9
|
+
|
10
|
+
You can reduce initializations of ChupaText by using ChupaText as a Ruby
|
11
|
+
library.
|
12
|
+
|
13
|
+
Here is a simple usage:
|
14
|
+
|
15
|
+
```
|
16
|
+
require "chupa-text"
|
17
|
+
gem "chupa-text-decomposer-html"
|
18
|
+
|
19
|
+
ChupaText::Decomposers.load
|
20
|
+
|
21
|
+
extractor = ChupaText::Extractor.new
|
22
|
+
extractor.apply_configuration(ChupaText::Configuration.default)
|
23
|
+
|
24
|
+
extractor.extract("http://ranguba.org/") do |text_data|
|
25
|
+
puts(text_data.body)
|
26
|
+
end
|
27
|
+
extractor.extract("http://ranguba.org/ja/") do |text_data|
|
28
|
+
puts(text_data.body)
|
29
|
+
end
|
30
|
+
```
|
31
|
+
|
32
|
+
It is better to use Bundler to manage decomposer plugins:
|
33
|
+
|
34
|
+
```
|
35
|
+
# Gemfile
|
36
|
+
source "https://rubygems.org"
|
37
|
+
|
38
|
+
gem "chupa-text-decomposer-html"
|
39
|
+
gem "chupa-text-decomposer-XXX"
|
40
|
+
# ...
|
41
|
+
```
|
42
|
+
|
43
|
+
Here is an example that uses the Gemfile:
|
44
|
+
|
45
|
+
```
|
46
|
+
require "bundler/setup"
require "chupa-text"
|
47
|
+
|
48
|
+
ChupaText::Decomposers.load
|
49
|
+
|
50
|
+
extractor = ChupaText::Extractor.new
|
51
|
+
extractor.apply_configuration(ChupaText::Configuration.default)
|
52
|
+
|
53
|
+
extractor.extract("http://ranguba.org/") do |text_data|
|
54
|
+
puts(text_data.body)
|
55
|
+
end
|
56
|
+
extractor.extract("http://ranguba.org/ja/") do |text_data|
|
57
|
+
puts(text_data.body)
|
58
|
+
end
|
59
|
+
```
|
60
|
+
|
61
|
+
Use {ChupaText::Data#[]} to get meta-data from extracted text
|
62
|
+
data. For example, you can get the title of an input HTML document:
|
63
|
+
|
64
|
+
```
|
65
|
+
extractor.extract("http://ranguba.org/") do |text_data|
|
66
|
+
puts(text_data["title"])
|
67
|
+
end
|
68
|
+
```
|
69
|
+
|
70
|
+
What meta-data is available depends on the decomposer. See the
|
71
|
+
decomposer's documentation for details.
|
72
|
+
|
data/doc/text/news.md
ADDED
data/lib/chupa-text.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "chupa-text/version"
|
18
|
+
|
19
|
+
require "chupa-text/configuration"
|
20
|
+
require "chupa-text/configuration-loader"
|
21
|
+
require "chupa-text/decomposer"
|
22
|
+
require "chupa-text/decomposer-registry"
|
23
|
+
require "chupa-text/decomposers"
|
24
|
+
require "chupa-text/extractor"
|
25
|
+
require "chupa-text/formatters"
|
26
|
+
require "chupa-text/mime-type"
|
27
|
+
require "chupa-text/mime-type-registry"
|
28
|
+
|
29
|
+
require "chupa-text/file-content"
|
30
|
+
require "chupa-text/virtual-content"
|
31
|
+
|
32
|
+
require "chupa-text/data"
|
33
|
+
require "chupa-text/input-data"
|
34
|
+
require "chupa-text/virtual-file-data"
|
35
|
+
require "chupa-text/text-data"
|
36
|
+
|
37
|
+
require "chupa-text/command"
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "chupa-text/command/chupa-text"
|
18
|
+
require "chupa-text/command/chupa-text-generate-decomposer"
|
@@ -0,0 +1,324 @@
|
|
1
|
+
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "optparse"
require "etc"
require "fileutils"
|
19
|
+
|
20
|
+
module ChupaText
|
21
|
+
module Command
|
22
|
+
class ChupaTextGenerateDecomposer
|
23
|
+
class << self
  # Class-level entry point: instantiates the command and forwards
  # all command line arguments to the instance's #run.
  #
  # @param arguments [Array<String>] command line arguments.
  # @return whatever the instance-level #run returns.
  def run(*arguments)
    new.run(*arguments)
  end
end
|
29
|
+
|
30
|
+
# Prepares a command with no name, extensions or MIME types yet, and
# with guessed defaults for author, e-mail and license.
def initialize
  @name = @extensions = @mime_types = nil
  @author = guess_author
  @email = guess_email
  @license = lgplv2_1_or_later_license
  # Built last: the option descriptions interpolate the defaults
  # assigned above.
  @parser = create_option_parser
end
|
39
|
+
|
40
|
+
# Parses the command line, asks for any parameter that is still
# missing and generates the decomposer skeleton.
#
# @param arguments [Array<String>] command line arguments.
# @return [Boolean] false when the command line can't be parsed
#   (the parse error message is printed), true otherwise.
def run(*arguments)
  parsed =
    begin
      @parser.parse!(arguments)
      true
    rescue OptionParser::ParseError => error
      puts(error.message)
      false
    end
  return false unless parsed

  read_missing_parameters
  generate
  true
end
|
51
|
+
|
52
|
+
private
|
53
|
+
# Guesses the author name: the name found in the password entry wins,
# the USERNAME environment variable is the fallback.
#
# @return [String, nil] the guessed name, or nil when neither source
#   provides one.
def guess_author
  guess_author_from_password_entry || ENV["USERNAME"]
end
|
58
|
+
|
59
|
+
# Extracts the author's name from the current user's password entry.
#
# The name is the first comma separated field of the GECOS value with
# surrounding whitespace removed.
#
# @return [String, nil] the name, or nil when there is no password
#   entry, the entry has no GECOS value, or the name field is blank
#   (e.g. a GECOS value of "" or ",,,").
def guess_author_from_password_entry
  password_entry = find_password_entry
  return nil if password_entry.nil?
  return nil unless password_entry.respond_to?(:gecos)

  # "".split(",") and ",,,".split(",") both return [], so the first
  # field may be nil; to_s keeps the strip below from failing.
  author = password_entry.gecos.to_s.split(",").first.to_s.strip
  author.empty? ? nil : author
end
|
67
|
+
|
68
|
+
# Looks up the password entry through Etc.getpwuid.
#
# @return the value of Etc.getpwuid, or nil when that call raises
#   ArgumentError.
def find_password_entry
  begin
    Etc.getpwuid
  rescue ArgumentError
    nil
  end
end
|
73
|
+
|
74
|
+
# Guesses the author's e-mail address from the EMAIL environment
# variable.
#
# @return [String, nil] the address, or nil when EMAIL isn't set.
def guess_email
  ENV.fetch("EMAIL", nil)
end
|
77
|
+
|
78
|
+
# Name of the default license. #generate_license compares @license
# with this value to decide whether LICENSE.txt is generated.
#
# @return [String] the license name.
def lgplv2_1_or_later_license
  'LGPLv2.1 or later'
end
|
81
|
+
|
82
|
+
# Builds the command line parser. Each option stores its value in the
# matching instance variable. The current values of @author, @email
# and @license are shown as defaults in the option descriptions, so
# they must be assigned before this is called.
#
# @return [OptionParser] the configured parser.
def create_option_parser
  OptionParser.new do |parser|
    parser.version = VERSION

    parser.on("--name=NAME",
              "Decomposer name",
              "(e.g.: html)") { |value| @name = value }

    parser.on("--extensions=EXTENSION1,EXTENSION2,...", Array,
              "Target file extensions",
              "(e.g.: htm,html,xhtml)") { |value| @extensions = value }

    parser.on("--mime-types=TYPE1,TYPE2,...", Array,
              "Target MIME types",
              "(e.g.: text/html,application/xhtml+xml)") { |value| @mime_types = value }

    parser.on("--author=AUTHOR",
              "Author",
              "(e.g.: 'Your Name')",
              "(default: #{@author})") { |value| @author = value }

    parser.on("--email=EMAIL",
              "Author E-mail",
              "(e.g.: your@email.address)",
              "(default: #{@email})") { |value| @email = value }

    parser.on("--license=LICENSE",
              "License",
              "(e.g.: MIT)",
              "(default: #{@license})") { |value| @license = value }
  end
end
|
120
|
+
|
121
|
+
# Asks, through #read_parameter, for every parameter whose instance
# variable is still nil or false. Parameters that already have a
# value are left untouched.
#
# @return the value of @license after the missing values were read.
def read_missing_parameters
  {
    "@name"       => "--name",
    "@extensions" => "--extensions",
    "@mime_types" => "--mime-types",
    "@author"     => "--author",
    "@email"      => "--email",
    "@license"    => "--license",
  }.each do |variable, long_option_name|
    next if instance_variable_get(variable)
    instance_variable_set(variable, read_parameter(long_option_name))
  end
  @license
end
|
129
|
+
|
130
|
+
# Interactively reads the value of one option from standard input.
#
# The prompt is the option's description joined with spaces, and the
# answer is converted with the option's own converter, so e.g. an
# Array option turns "htm,html" into ["htm", "html"].
#
# @param long_option_name [String] long option name such as "--name".
# @return the converted answer.
# @raise [ArgumentError] if the parser has no such option.
# @raise [EOFError] if standard input is closed before an answer is
#   given.
def read_parameter(long_option_name)
  target_option = @parser.top.list.find do |option|
    option.respond_to?(:long) && option.long.include?(long_option_name)
  end
  if target_option.nil?
    raise ArgumentError, "unknown option: #{long_option_name}"
  end

  print("#{target_option.desc.join(' ')}: ")
  answer = $stdin.gets
  # gets returns nil at end of input; report that explicitly instead
  # of failing with NoMethodError on nil.chomp.
  if answer.nil?
    raise EOFError, "no input given for #{long_option_name}"
  end
  target_option.conv.call(answer.chomp)
end
|
138
|
+
|
139
|
+
# Name of the gem to generate: "chupa-text-decomposer-" followed by
# the decomposer name.
#
# @return [String] the gem name.
def gem_name
  format("chupa-text-decomposer-%s", @name)
end
|
142
|
+
|
143
|
+
# Runs every generation step in a fixed order: gemspec, Gemfile,
# Rakefile, license, decomposer, test, test helper and test runner.
#
# @return the result of the last step (#generate_test_runner).
def generate
  steps = [
    :generate_gemspec,
    :generate_gemfile,
    :generate_rakefile,
    :generate_license,
    :generate_decomposer,
    :generate_test,
    :generate_test_helper,
    :generate_test_runner,
  ]
  steps.map { |step| send(step) }.last
end
|
153
|
+
|
154
|
+
# Generates "<gem_name>.gemspec" for the new decomposer gem.
#
# Values that come from the user (name, author, e-mail, MIME types,
# license) are written with String#dump so that quotes, backslashes
# or "#{" in them can't break the generated Ruby code. For plain
# ASCII values the output is the same as plain quoting.
def generate_gemspec
  create_file("#{gem_name}.gemspec") do |file|
    summary = "ChupaText decomposer for #{@mime_types.join(' ')}."
    # "\#{spec.name}" below is escaped on purpose: it must be
    # interpolated when the gemspec is loaded, not now.
    file.puts(<<-GEMSPEC)
# -*- mode: ruby; coding: utf-8 -*-

Gem::Specification.new do |spec|
  spec.name = #{gem_name.dump}
  spec.version = "1.0.0"
  spec.author = #{@author.to_s.dump}
  spec.email = #{@email.to_s.dump}
  spec.summary = #{summary.dump}
  spec.description = spec.summary
  spec.license = #{@license.to_s.dump}
  spec.files = ["\#{spec.name}.gemspec"]
  spec.files += Dir.glob("{README*,LICENSE*,Rakefile,Gemfile}")
  spec.files += Dir.glob("lib/**/*.rb")
  spec.files += Dir.glob("test/fixture/**/*")
  spec.files += Dir.glob("test/**/*.rb")

  spec.add_runtime_dependency("chupa-text")

  spec.add_development_dependency("bundler")
  spec.add_development_dependency("rake")
  spec.add_development_dependency("test-unit")
end
    GEMSPEC
  end
end
|
182
|
+
|
183
|
+
# Generates the Gemfile of the new gem. It uses rubygems.org as the
# source and takes its dependencies from the gemspec.
def generate_gemfile
  gemfile_content = <<-GEMFILE
# -*- mode: ruby; coding: utf-8 -*-

source "https://rubygems.org/"

gemspec
  GEMFILE
  create_file("Gemfile") { |file| file.puts(gemfile_content) }
end
|
194
|
+
|
195
|
+
# Generates the Rakefile of the new gem through create_file("Rakefile").
# The template loads "bundler/gem_tasks", makes :test the default task
# and defines a :test task that runs test/run-test.rb.
# NOTE(review): the template text is written to the file verbatim, so
# its whitespace is part of the output; the nested line appears here
# without leading whitespace -- confirm against the released file.
def generate_rakefile
|
196
|
+
create_file("Rakefile") do |file|
|
197
|
+
file.puts(<<-RAKEFILE)
|
198
|
+
# -*- mode: ruby; coding: utf-8 -*-
|
199
|
+
|
200
|
+
require "bundler/gem_tasks"
|
201
|
+
|
202
|
+
task :default => :test
|
203
|
+
|
204
|
+
desc "Run tests"
|
205
|
+
task :test do
|
206
|
+
ruby("test/run-test.rb")
|
207
|
+
end
|
208
|
+
RAKEFILE
|
209
|
+
end
|
210
|
+
end
|
211
|
+
|
212
|
+
# Copies this package's LICENSE.txt into the new gem, but only when
# the selected license is the default one
# (#lgplv2_1_or_later_license). For any other license nothing is
# generated and nil is returned.
def generate_license
  if @license == lgplv2_1_or_later_license
    # LICENSE.txt is looked up three directories above this file.
    license_path = File.join(File.dirname(__FILE__),
                             "..", "..", "..", "LICENSE.txt")
    create_file("LICENSE.txt") do |file|
      file.puts(File.read(license_path))
    end
  end
end
|
220
|
+
|
221
|
+
# Generates the decomposer skeleton at
# lib/chupa-text/decomposers/<name>.rb.
#
# The class name is the decomposer name with each "-" or "_"
# separated part capitalized ("html" => "Html",
# "open-document" => "OpenDocument"), so names with separators still
# produce a valid Ruby constant.
#
# The generated #decompose raises NotImplementedError first; the
# lines after it are a template for the real implementation.
def generate_decomposer
  class_name = @name.split(/[-_]/).map(&:capitalize).join
  create_file("lib/chupa-text/decomposers/#{@name}.rb") do |file|
    # "\#{...}" sequences are escaped so that they are interpolated
    # by the generated code, not while generating it.
    file.puts(<<-RUBY)
module ChupaText
  module Decomposers
    class #{class_name} < Decomposer
      def target?(data)
        #{@extensions.inspect}.include?(data.extension) ||
          #{@mime_types.inspect}.include?(data.mime_type)
      end

      def decompose(data)
        raise NotImplementedError, "\#{self.class}\#\#{__method__} isn't implemented yet."
        text = "IMPLEMENT ME"
        text_data = TextData.new(text)
        yield(text_data)
      end
    end
  end
end
    RUBY
  end
end
|
244
|
+
|
245
|
+
# Generates the test skeleton at test/test-<name>.rb.
#
# The class name part is the decomposer name with each "-" or "_"
# separated part capitalized ("html" => "Html",
# "open-document" => "OpenDocument"), so names with separators still
# produce valid Ruby constants. The first MIME type is used for the
# test data.
def generate_test
  class_name = @name.split(/[-_]/).map(&:capitalize).join
  create_file("test/test-#{@name}.rb") do |file|
    file.puts(<<-RUBY)
class Test#{class_name} < Test::Unit::TestCase
  include Helper

  def setup
    @decomposer = ChupaText::Decomposers::#{class_name}.new({})
  end

  sub_test_case("decompose") do
    def decompose(input_body)
      data = ChupaText::Data.new
      data.mime_type = #{@mime_types.first.dump}
      data.body = input_body

      decomposed = []
      @decomposer.decompose(data) do |decomposed_data|
        decomposed << decomposed_data
      end
      decomposed
    end

    def test_body
      input_body = "TODO (input)"
      expected_text = "TODO (extracted)"
      assert_equal([expected_text],
                   decompose(input_body).collect(&:body))
    end
  end
end
    RUBY
  end
end
|
279
|
+
|
280
|
+
# Generates test/helper.rb of the new gem through
# create_file("test/helper.rb"). The generated Helper module provides
# fixture_path(*components), which joins the components below the
# "fixture" directory next to the helper file.
# NOTE(review): the template text is written to the file verbatim, so
# its whitespace is part of the output; the nested lines appear here
# without leading whitespace -- confirm against the released file.
def generate_test_helper
|
281
|
+
create_file("test/helper.rb") do |file|
|
282
|
+
file.puts(<<-RUBY)
|
283
|
+
module Helper
|
284
|
+
def fixture_path(*components)
|
285
|
+
base_dir = File.expand_path(File.dirname(__FILE__))
|
286
|
+
File.join(base_dir, "fixture", *components)
|
287
|
+
end
|
288
|
+
end
|
289
|
+
RUBY
|
290
|
+
end
|
291
|
+
end
|
292
|
+
|
293
|
+
# Generates test/run-test.rb of the new gem. The script sets up
# Bundler, loads test-unit and chupa-text with its decomposers, loads
# the test helper and exits with the result of the test-unit auto
# runner.
def generate_test_runner
  runner_script = <<-RUNNER
#!/usr/bin/env ruby

require "bundler/setup"

require "test-unit"

require "chupa-text"
ChupaText::Decomposers.load

require_relative "helper"

exit(Test::Unit::AutoRunner.run(true))
  RUNNER
  create_file("test/run-test.rb") { |file| file.puts(runner_script) }
end
|
311
|
+
|
312
|
+
# Creates a file below the generated gem's directory and passes the
# opened file to the block. The parent directory is created first
# when it is missing. Progress messages are printed for the directory
# (only when it is created) and for the file.
#
# Needs the "fileutils" library for FileUtils.mkdir_p.
#
# @param path [String] path of the file relative to the gem's
#   directory (#gem_name).
# @yield [file] the file opened in write mode.
# @return the value returned by File.open, i.e. the block's result.
def create_file(path, &block)
  real_path = File.join(gem_name, path)
  directory = File.dirname(real_path)
  # Check for a directory rather than for mere existence: a regular
  # file at that path can't hold the new file.
  unless File.directory?(directory)
    puts("Creating directory: #{directory}")
    FileUtils.mkdir_p(directory)
  end
  puts("Creating file: #{real_path}")
  File.open(real_path, "w", &block)
end
|
322
|
+
end
|
323
|
+
end
|
324
|
+
end
|