chupa-text 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +5 -0
  3. data/Gemfile +21 -0
  4. data/LICENSE.txt +502 -0
  5. data/README.md +91 -0
  6. data/Rakefile +46 -0
  7. data/bin/chupa-text +21 -0
  8. data/bin/chupa-text-generate-decomposer +21 -0
  9. data/chupa-text.gemspec +58 -0
  10. data/data/chupa-text.conf +5 -0
  11. data/data/mime-types.conf +19 -0
  12. data/doc/text/command-line.md +136 -0
  13. data/doc/text/decomposer.md +343 -0
  14. data/doc/text/library.md +72 -0
  15. data/doc/text/news.md +5 -0
  16. data/lib/chupa-text.rb +37 -0
  17. data/lib/chupa-text/command.rb +18 -0
  18. data/lib/chupa-text/command/chupa-text-generate-decomposer.rb +324 -0
  19. data/lib/chupa-text/command/chupa-text.rb +102 -0
  20. data/lib/chupa-text/configuration-loader.rb +95 -0
  21. data/lib/chupa-text/configuration.rb +49 -0
  22. data/lib/chupa-text/data.rb +149 -0
  23. data/lib/chupa-text/decomposer-registry.rb +37 -0
  24. data/lib/chupa-text/decomposer.rb +37 -0
  25. data/lib/chupa-text/decomposers.rb +59 -0
  26. data/lib/chupa-text/decomposers/csv.rb +44 -0
  27. data/lib/chupa-text/decomposers/gzip.rb +51 -0
  28. data/lib/chupa-text/decomposers/tar.rb +42 -0
  29. data/lib/chupa-text/decomposers/xml.rb +55 -0
  30. data/lib/chupa-text/extractor.rb +91 -0
  31. data/lib/chupa-text/file-content.rb +35 -0
  32. data/lib/chupa-text/formatters.rb +17 -0
  33. data/lib/chupa-text/formatters/json.rb +60 -0
  34. data/lib/chupa-text/input-data.rb +58 -0
  35. data/lib/chupa-text/mime-type-registry.rb +41 -0
  36. data/lib/chupa-text/mime-type.rb +36 -0
  37. data/lib/chupa-text/text-data.rb +26 -0
  38. data/lib/chupa-text/version.rb +19 -0
  39. data/lib/chupa-text/virtual-content.rb +91 -0
  40. data/lib/chupa-text/virtual-file-data.rb +46 -0
  41. data/test/command/test-chupa-text.rb +178 -0
  42. data/test/decomposers/test-csv.rb +48 -0
  43. data/test/decomposers/test-gzip.rb +113 -0
  44. data/test/decomposers/test-tar.rb +78 -0
  45. data/test/decomposers/test-xml.rb +58 -0
  46. data/test/fixture/command/chupa-text/hello.txt +1 -0
  47. data/test/fixture/command/chupa-text/hello.txt.gz +0 -0
  48. data/test/fixture/command/chupa-text/no-decomposer.conf +3 -0
  49. data/test/fixture/extractor/hello.txt +1 -0
  50. data/test/fixture/gzip/hello.tar.gz +0 -0
  51. data/test/fixture/gzip/hello.tgz +0 -0
  52. data/test/fixture/gzip/hello.txt.gz +0 -0
  53. data/test/fixture/tar/directory.tar +0 -0
  54. data/test/fixture/tar/top-level.tar +0 -0
  55. data/test/helper.rb +25 -0
  56. data/test/run-test.rb +35 -0
  57. data/test/test-configuration-loader.rb +54 -0
  58. data/test/test-data.rb +85 -0
  59. data/test/test-decomposer-registry.rb +30 -0
  60. data/test/test-decomposer.rb +41 -0
  61. data/test/test-decomposers.rb +59 -0
  62. data/test/test-extractor.rb +125 -0
  63. data/test/test-file-content.rb +51 -0
  64. data/test/test-mime-type-registry.rb +48 -0
  65. data/test/test-text-data.rb +36 -0
  66. data/test/test-virtual-content.rb +103 -0
  67. metadata +183 -0
@@ -0,0 +1,72 @@
1
+ # How to use ChupaText as a Ruby library
2
+
3
+ You can use ChupaText as Ruby library. If you want to extract text
4
+ data from many input data, `chupa-text` command may be
5
+ inefficient. You need to execute `chupa-text` command to process one
6
+ input file. You need to execute `chupa-text` command N times to
7
+ process N input files. It means that you need to initialize ChupaText
8
+ N times. It may be inefficient.
9
+
10
+ You can reduce initializations of ChupaText by using ChupaText as Ruby
11
+ library.
12
+
13
+ Here is a simple usage:
14
+
15
+ ```
16
+ require "chupa-text"
17
+ gem "chupa-text-decomposer-html"
18
+
19
+ ChupaText::Decomposers.load
20
+
21
+ extractor = ChupaText::Extractor.new
22
+ extractor.apply_configuration(ChupaText::Configuration.default)
23
+
24
+ extractor.extract("http://ranguba.org/") do |text_data|
25
+ puts(text_data.body)
26
+ end
27
+ extractor.extract("http://ranguba.org/ja/") do |text_data|
28
+ puts(text_data.body)
29
+ end
30
+ ```
31
+
32
+ It is better to use Bundler to manage decomposer plugins:
33
+
34
+ ```
35
+ # Gemfile
36
+ source "https://rubygems.org"
37
+
38
+ gem "chupa-text-decomposer-html"
39
+ gem "chupa-text-decomposer-XXX"
40
+ # ...
41
+ ```
42
+
43
+ Here is a usage that uses the Gemfile:
44
+
45
+ ```
46
+ require "bundler/setup"
+ require "chupa-text"
47
+
48
+ ChupaText::Decomposers.load
49
+
50
+ extractor = ChupaText::Extractor.new
51
+ extractor.apply_configuration(ChupaText::Configuration.default)
52
+
53
+ extractor.extract("http://ranguba.org/") do |text_data|
54
+ puts(text_data.body)
55
+ end
56
+ extractor.extract("http://ranguba.org/ja/") do |text_data|
57
+ puts(text_data.body)
58
+ end
59
+ ```
60
+
61
+ Use {ChupaText::Data#[]} to get meta-data from extracted text
62
+ data. For example, you can get title from input HTML:
63
+
64
+ ```
65
+ extractor.extract("http://ranguba.org/") do |text_data|
66
+ puts(text_data["title"])
67
+ end
68
+ ```
69
+
70
+ Which meta-data are available depends on the decomposer. See each
71
+ decomposer's documentation for details.
72
+
@@ -0,0 +1,5 @@
1
+ # News
2
+
3
+ ## 1.0.0: 2014-01-05
4
+
5
+ The first release!!!
@@ -0,0 +1,37 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "chupa-text/version"
18
+
19
+ require "chupa-text/configuration"
20
+ require "chupa-text/configuration-loader"
21
+ require "chupa-text/decomposer"
22
+ require "chupa-text/decomposer-registry"
23
+ require "chupa-text/decomposers"
24
+ require "chupa-text/extractor"
25
+ require "chupa-text/formatters"
26
+ require "chupa-text/mime-type"
27
+ require "chupa-text/mime-type-registry"
28
+
29
+ require "chupa-text/file-content"
30
+ require "chupa-text/virtual-content"
31
+
32
+ require "chupa-text/data"
33
+ require "chupa-text/input-data"
34
+ require "chupa-text/virtual-file-data"
35
+ require "chupa-text/text-data"
36
+
37
+ require "chupa-text/command"
@@ -0,0 +1,18 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "chupa-text/command/chupa-text"
18
+ require "chupa-text/command/chupa-text-generate-decomposer"
@@ -0,0 +1,324 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
require "etc"
require "fileutils"
require "optparse"
19
+
20
+ module ChupaText
21
+ module Command
22
# Command that generates the skeleton of a ChupaText decomposer gem
# (gemspec, Gemfile, Rakefile, license, decomposer class and tests)
# under a "chupa-text-decomposer-NAME" directory relative to the
# current working directory.
class ChupaTextGenerateDecomposer
  class << self
    # Builds a new command and runs it with the given arguments.
    #
    # @param arguments [Array<String>] command line arguments.
    # @return [Boolean] the value returned by {#run}.
    def run(*arguments)
      new.run(*arguments)
    end
  end

  def initialize
    @name = nil
    @extensions = nil
    @mime_types = nil
    @author = guess_author
    @email = guess_email
    @license = lgplv2_1_or_later_license
    # Created last because the option descriptions embed the defaults
    # computed above.
    @parser = create_option_parser
  end

  # Parses the arguments, interactively asks for every parameter that
  # is still missing and then generates the files.
  #
  # @param arguments [Array<String>] command line arguments.
  # @return [Boolean] `true` on success. `false` when an option can't
  #   be parsed or when standard input is exhausted before a missing
  #   parameter could be read; the error message is printed.
  def run(*arguments)
    @parser.parse!(arguments)
    read_missing_parameters
    generate
    true
  rescue OptionParser::ParseError => error
    puts(error.message)
    false
  end

  private
  # @return [String, nil] the author name taken from the password
  #   entry, falling back to the `USERNAME` environment variable.
  def guess_author
    guess_author_from_password_entry || ENV["USERNAME"]
  end

  # @return [String, nil] the first comma separated field of the
  #   GECOS field, or `nil` when there is no password entry or the
  #   field is missing or blank.
  def guess_author_from_password_entry
    password_entry = find_password_entry
    return nil if password_entry.nil?

    # GECOS may be nil or empty. "".split(",").first is nil, so
    # normalize with to_s before stripping.
    author = password_entry.gecos.to_s.split(",").first.to_s.strip
    author.empty? ? nil : author
  end

  # @return [Object, nil] what `Etc.getpwuid` returns, or `nil` when
  #   it raises `ArgumentError`.
  def find_password_entry
    Etc.getpwuid
  rescue ArgumentError
    nil
  end

  # @return [String, nil] the value of the `EMAIL` environment variable.
  def guess_email
    ENV["EMAIL"]
  end

  # @return [String] the default license label. {#generate_license}
  #   only writes LICENSE.txt when the license still equals this value.
  def lgplv2_1_or_later_license
    "LGPLv2.1 or later"
  end

  # @return [OptionParser] parser whose handlers store the given
  #   values into the instance variables of this command.
  def create_option_parser
    parser = OptionParser.new
    parser.version = VERSION
    parser.on("--name=NAME",
              "Decomposer name",
              "(e.g.: html)") do |name|
      @name = name
    end
    parser.on("--extensions=EXTENSION1,EXTENSION2,...", Array,
              "Target file extensions",
              "(e.g.: htm,html,xhtml)") do |extensions|
      @extensions = extensions
    end
    parser.on("--mime-types=TYPE1,TYPE2,...", Array,
              "Target MIME types",
              "(e.g.: text/html,application/xhtml+xml)") do |mime_types|
      @mime_types = mime_types
    end
    parser.on("--author=AUTHOR",
              "Author",
              "(e.g.: 'Your Name')",
              "(default: #{@author})") do |author|
      @author = author
    end
    parser.on("--email=EMAIL",
              "Author E-mail",
              "(e.g.: your@email.address)",
              "(default: #{@email})") do |email|
      @email = email
    end
    parser.on("--license=LICENSE",
              "License",
              "(e.g.: MIT)",
              "(default: #{@license})") do |license|
      @license = license
    end
    parser
  end

  # Asks on standard input for every parameter that is still unset
  # after option parsing.
  def read_missing_parameters
    @name ||= read_parameter("--name")
    @extensions ||= read_parameter("--extensions")
    @mime_types ||= read_parameter("--mime-types")
    @author ||= read_parameter("--author")
    @email ||= read_parameter("--email")
    @license ||= read_parameter("--license")
  end

  # Prompts with the description of the given option and converts the
  # answer with the converter of that option.
  #
  # @param long_option_name [String] long option name such as "--name".
  # @return [Object] the converted answer.
  # @raise [OptionParser::MissingArgument] when standard input reaches
  #   EOF before an answer is read.
  def read_parameter(long_option_name)
    target_option = @parser.top.list.find do |option|
      option.long.include?(long_option_name)
    end
    prompt = target_option.desc.join(" ") + ": "
    print(prompt)
    answer = $stdin.gets
    # gets returns nil at EOF; report it as a missing argument instead
    # of failing with NoMethodError on nil.
    raise OptionParser::MissingArgument, long_option_name if answer.nil?
    target_option.conv.call(answer.chomp)
  end

  # @return [String] name of the generated gem and of its directory.
  def gem_name
    "chupa-text-decomposer-#{@name}"
  end

  # @return [String] class name derived from the decomposer name.
  #   "-" and "_" separated words are camelized ("open-document"
  #   becomes "OpenDocument"); a single word behaves like
  #   `String#capitalize` ("html" becomes "Html").
  def class_name
    @name.split(/[-_]/).map(&:capitalize).join
  end

  # Writes every file of the skeleton.
  def generate
    generate_gemspec
    generate_gemfile
    generate_rakefile
    generate_license
    generate_decomposer
    generate_test
    generate_test_helper
    generate_test_runner
  end

  # Writes GEM_NAME.gemspec. User supplied values are emitted with
  # String#inspect so that quotes or backslashes in them can't break
  # the generated Ruby code.
  def generate_gemspec
    summary = "ChupaText decomposer for #{@mime_types.join(' ')}."
    create_file("#{gem_name}.gemspec") do |file|
      file.puts(<<-GEMSPEC)
# -*- mode: ruby; coding: utf-8 -*-

Gem::Specification.new do |spec|
  spec.name = #{gem_name.inspect}
  spec.version = "1.0.0"
  spec.author = #{@author.to_s.inspect}
  spec.email = #{@email.to_s.inspect}
  spec.summary = #{summary.inspect}
  spec.description = spec.summary
  spec.license = #{@license.to_s.inspect}
  spec.files = ["\#{spec.name}.gemspec"]
  spec.files += Dir.glob("{README*,LICENSE*,Rakefile,Gemfile}")
  spec.files += Dir.glob("lib/**/*.rb")
  spec.files += Dir.glob("test/fixture/**/*")
  spec.files += Dir.glob("test/**/*.rb")

  spec.add_runtime_dependency("chupa-text")

  spec.add_development_dependency("bundler")
  spec.add_development_dependency("rake")
  spec.add_development_dependency("test-unit")
end
      GEMSPEC
    end
  end

  # Writes a Gemfile that delegates to the gemspec.
  def generate_gemfile
    create_file("Gemfile") do |file|
      file.puts(<<-Gemfile)
# -*- mode: ruby; coding: utf-8 -*-

source "https://rubygems.org/"

gemspec
      Gemfile
    end
  end

  # Writes a Rakefile whose default task runs the tests.
  def generate_rakefile
    create_file("Rakefile") do |file|
      file.puts(<<-RAKEFILE)
# -*- mode: ruby; coding: utf-8 -*-

require "bundler/gem_tasks"

task :default => :test

desc "Run tests"
task :test do
  ruby("test/run-test.rb")
end
      RAKEFILE
    end
  end

  # Copies LICENSE.txt found three directories above this file, but
  # only when the license is still the default one.
  def generate_license
    return unless @license == lgplv2_1_or_later_license
    base_dir = File.join(File.dirname(__FILE__), "..", "..", "..")
    lgpl2_1_license_file = File.join(base_dir, "LICENSE.txt")
    create_file("LICENSE.txt") do |file|
      file.puts(File.read(lgpl2_1_license_file))
    end
  end

  # Writes the decomposer class template.
  def generate_decomposer
    create_file("lib/chupa-text/decomposers/#{@name}.rb") do |file|
      file.puts(<<-RUBY)
module ChupaText
  module Decomposers
    class #{class_name} < Decomposer
      def target?(data)
        #{@extensions.inspect}.include?(data.extension) or
          #{@mime_types.inspect}.include?(data.mime_type)
      end

      def decompose(data)
        raise NotImplementedError, "\#{self.class}\#\#{__method__} isn't implemented yet."
        text = "IMPLEMENT ME"
        text_data = TextData.new(text)
        yield(text_data)
      end
    end
  end
end
      RUBY
    end
  end

  # Writes the test case template for the decomposer.
  def generate_test
    create_file("test/test-#{@name}.rb") do |file|
      file.puts(<<-RUBY)
class Test#{class_name} < Test::Unit::TestCase
  include Helper

  def setup
    @decomposer = ChupaText::Decomposers::#{class_name}.new({})
  end

  sub_test_case("decompose") do
    def decompose(input_body)
      data = ChupaText::Data.new
      data.mime_type = #{@mime_types.first.to_s.dump}
      data.body = input_body

      decomposed = []
      @decomposer.decompose(data) do |decomposed_data|
        decomposed << decomposed_data
      end
      decomposed
    end

    def test_body
      input_body = "TODO (input)"
      expected_text = "TODO (extracted)"
      assert_equal([expected_text],
                   decompose(input_body).collect(&:body))
    end
  end
end
      RUBY
    end
  end

  # Writes the helper module shared by the generated tests.
  def generate_test_helper
    create_file("test/helper.rb") do |file|
      file.puts(<<-RUBY)
module Helper
  def fixture_path(*components)
    base_dir = File.expand_path(File.dirname(__FILE__))
    File.join(base_dir, "fixture", *components)
  end
end
      RUBY
    end
  end

  # Writes the script that runs the generated tests.
  def generate_test_runner
    create_file("test/run-test.rb") do |file|
      file.puts(<<-RUBY)
#!/usr/bin/env ruby

require "bundler/setup"

require "test-unit"

require "chupa-text"
ChupaText::Decomposers.load

require_relative "helper"

exit(Test::Unit::AutoRunner.run(true))
      RUBY
    end
  end

  # Opens GEM_NAME/path for writing, creating its directory first when
  # needed, and reports both steps on standard output.
  #
  # @param path [String] path relative to the gem directory.
  # @yield [file] the file opened in "w" mode.
  def create_file(path, &block)
    real_path = File.join(gem_name, path)
    directory = File.dirname(real_path)
    unless File.exist?(directory)
      puts("Creating directory: #{directory}")
      FileUtils.mkdir_p(directory)
    end
    puts("Creating file: #{real_path}")
    File.open(real_path, "w", &block)
  end
end
323
+ end
324
+ end