chupa-text 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +5 -0
  3. data/Gemfile +21 -0
  4. data/LICENSE.txt +502 -0
  5. data/README.md +91 -0
  6. data/Rakefile +46 -0
  7. data/bin/chupa-text +21 -0
  8. data/bin/chupa-text-generate-decomposer +21 -0
  9. data/chupa-text.gemspec +58 -0
  10. data/data/chupa-text.conf +5 -0
  11. data/data/mime-types.conf +19 -0
  12. data/doc/text/command-line.md +136 -0
  13. data/doc/text/decomposer.md +343 -0
  14. data/doc/text/library.md +72 -0
  15. data/doc/text/news.md +5 -0
  16. data/lib/chupa-text.rb +37 -0
  17. data/lib/chupa-text/command.rb +18 -0
  18. data/lib/chupa-text/command/chupa-text-generate-decomposer.rb +324 -0
  19. data/lib/chupa-text/command/chupa-text.rb +102 -0
  20. data/lib/chupa-text/configuration-loader.rb +95 -0
  21. data/lib/chupa-text/configuration.rb +49 -0
  22. data/lib/chupa-text/data.rb +149 -0
  23. data/lib/chupa-text/decomposer-registry.rb +37 -0
  24. data/lib/chupa-text/decomposer.rb +37 -0
  25. data/lib/chupa-text/decomposers.rb +59 -0
  26. data/lib/chupa-text/decomposers/csv.rb +44 -0
  27. data/lib/chupa-text/decomposers/gzip.rb +51 -0
  28. data/lib/chupa-text/decomposers/tar.rb +42 -0
  29. data/lib/chupa-text/decomposers/xml.rb +55 -0
  30. data/lib/chupa-text/extractor.rb +91 -0
  31. data/lib/chupa-text/file-content.rb +35 -0
  32. data/lib/chupa-text/formatters.rb +17 -0
  33. data/lib/chupa-text/formatters/json.rb +60 -0
  34. data/lib/chupa-text/input-data.rb +58 -0
  35. data/lib/chupa-text/mime-type-registry.rb +41 -0
  36. data/lib/chupa-text/mime-type.rb +36 -0
  37. data/lib/chupa-text/text-data.rb +26 -0
  38. data/lib/chupa-text/version.rb +19 -0
  39. data/lib/chupa-text/virtual-content.rb +91 -0
  40. data/lib/chupa-text/virtual-file-data.rb +46 -0
  41. data/test/command/test-chupa-text.rb +178 -0
  42. data/test/decomposers/test-csv.rb +48 -0
  43. data/test/decomposers/test-gzip.rb +113 -0
  44. data/test/decomposers/test-tar.rb +78 -0
  45. data/test/decomposers/test-xml.rb +58 -0
  46. data/test/fixture/command/chupa-text/hello.txt +1 -0
  47. data/test/fixture/command/chupa-text/hello.txt.gz +0 -0
  48. data/test/fixture/command/chupa-text/no-decomposer.conf +3 -0
  49. data/test/fixture/extractor/hello.txt +1 -0
  50. data/test/fixture/gzip/hello.tar.gz +0 -0
  51. data/test/fixture/gzip/hello.tgz +0 -0
  52. data/test/fixture/gzip/hello.txt.gz +0 -0
  53. data/test/fixture/tar/directory.tar +0 -0
  54. data/test/fixture/tar/top-level.tar +0 -0
  55. data/test/helper.rb +25 -0
  56. data/test/run-test.rb +35 -0
  57. data/test/test-configuration-loader.rb +54 -0
  58. data/test/test-data.rb +85 -0
  59. data/test/test-decomposer-registry.rb +30 -0
  60. data/test/test-decomposer.rb +41 -0
  61. data/test/test-decomposers.rb +59 -0
  62. data/test/test-extractor.rb +125 -0
  63. data/test/test-file-content.rb +51 -0
  64. data/test/test-mime-type-registry.rb +48 -0
  65. data/test/test-text-data.rb +36 -0
  66. data/test/test-virtual-content.rb +103 -0
  67. metadata +183 -0
@@ -0,0 +1,59 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
module ChupaText
  # Module-level helpers that discover decomposer implementations on the
  # load path and build decomposer instances from a configuration.
  module Decomposers
    class << self
      # Requires every `chupa-text/decomposers/*.rb` file found directly
      # under a directory listed in `$LOAD_PATH`.
      #
      # `$LOAD_PATH` entries that are not directories are skipped. All
      # candidate paths are collected first and required afterwards, each
      # one by its load-path-relative name without the `.rb` suffix.
      #
      # @return [Array<String>] the feature names passed to `require`.
      def load
        paths = []
        $LOAD_PATH.each do |load_path|
          next unless File.directory?(load_path)
          # Globbing from inside the directory yields paths that are already
          # relative to the load path, i.e. valid `require` arguments.
          Dir.chdir(load_path) do
            Dir.glob("chupa-text/decomposers/*.rb") do |decomposer_path|
              paths << decomposer_path.gsub(/\.rb\z/, "")
            end
          end
        end
        paths.each do |path|
          require path
        end
      end

      # Instantiates the decomposers enabled by a configuration.
      #
      # @param registry An object that responds to `find(name)` and to
      #   `each`, yielding the registered name as the first block argument.
      # @param configuration An object that responds to `names` (glob
      #   patterns selecting decomposers) and `options` (a mapping from
      #   decomposer name to the options for that decomposer).
      #
      # @return [Array] one decomposer instance per resolved name, each
      #   built with the options registered under its own name, or with
      #   `{}` when there are none.
      def create(registry, configuration)
        enabled_names = resolve_names(registry, configuration.names)
        enabled_names.collect do |enabled_name|
          decomposer_class = registry.find(enabled_name)
          # Look options up by the decomposer's own name. A bare `name`
          # here would call Module#name on this module instead.
          options = configuration.options[enabled_name] || {}
          decomposer_class.new(options)
        end
      end

      private
      # Expands glob patterns into the registered names they match.
      #
      # @param registry The registry whose names are matched.
      # @param enabled_names [Array<String>] patterns understood by
      #   `File.fnmatch`.
      #
      # @return [Array<String>] matching names, grouped by pattern in the
      #   order the patterns were given.
      def resolve_names(registry, enabled_names)
        resolved_names = []
        flag = 0
        # FNM_EXTGLOB is only used when this Ruby defines it.
        flag |= File::FNM_EXTGLOB if File.const_defined?(:FNM_EXTGLOB)
        enabled_names.each do |enabled_name|
          registry.each do |name,|
            next unless File.fnmatch(enabled_name, name, flag)
            resolved_names << name
          end
        end
        resolved_names
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "csv"
18
+
19
module ChupaText
  module Decomposers
    # Decomposer that flattens CSV input into plain text: the fields of
    # each row are joined with a single space and each row is terminated
    # by a newline.
    class CSV < Decomposer
      registry.register("csv", self)

      # @param data The candidate data; its `extension` and `mime_type`
      #   are inspected.
      #
      # @return [Boolean] whether the extension is "csv" or the MIME type
      #   is "text/csv".
      def target?(data)
        data.extension == "csv" || data.mime_type == "text/csv"
      end

      # Reads all rows from `data` and yields a single text data that
      # carries the flattened rows and the URI of `data`.
      #
      # @param data The CSV data to convert.
      #
      # @yield [text_data] Called once with the converted text.
      def decompose(data)
        buffer = ""
        data.open do |input|
          ::CSV.new(input).each do |fields|
            buffer << fields.join(" ")
            buffer << "\n"
          end
        end
        converted = TextData.new(buffer)
        converted.uri = data.uri
        yield(converted)
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "stringio"
18
+ require "zlib"
19
+
20
module ChupaText
  module Decomposers
    # Decomposer that wraps gzip-compressed data in a gzip reader and
    # yields the decompressed stream as a new virtual file data.
    class Gzip < Decomposer
      registry.register("gzip", self)

      TARGET_EXTENSIONS = ["gz", "tgz"]
      TARGET_MIME_TYPES = [
        "application/gzip",
        "application/x-gzip",
        "application/x-gtar-compressed",
      ]

      # @param data The candidate data; its `extension` and `mime_type`
      #   are inspected.
      #
      # @return [Boolean] whether the extension or the MIME type is one of
      #   the gzip ones listed in the constants above.
      def target?(data)
        return true if TARGET_EXTENSIONS.include?(data.extension)
        TARGET_MIME_TYPES.include?(data.mime_type)
      end

      # Yields the decompressed content of `data` as a virtual file data
      # whose `source` is `data`.
      #
      # @param data The gzip-compressed data.
      #
      # @yield [extracted] Called once with the decompressed data.
      def decompose(data)
        # The reader is created first so that the gzip stream is opened
        # before the URI of the extracted data is computed.
        gzip_reader = Zlib::GzipReader.new(StringIO.new(data.body))
        extracted = VirtualFileData.new(extracted_uri(data), gzip_reader)
        extracted.source = data
        yield(extracted)
      end

      private
      # Derives the URI of the decompressed data from the extension:
      # ".gz" is dropped and ".tgz" becomes ".tar". Any other extension
      # gives nil.
      def extracted_uri(data)
        case data.extension
        when "gz"
          data.uri.to_s.gsub(/\.gz\z/i, "")
        when "tgz"
          data.uri.to_s.gsub(/\.tgz\z/i, ".tar")
        end
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "stringio"
18
+ require "rubygems/package"
19
+
20
module ChupaText
  module Decomposers
    # Decomposer that walks a tar archive and yields each regular file
    # entry as a virtual file data.
    class Tar < Decomposer
      registry.register("tar", self)

      # @param data The candidate data; its `extension` and `mime_type`
      #   are inspected.
      #
      # @return [Boolean] whether the extension is "tar" or the MIME type
      #   is "application/x-tar".
      def target?(data)
        data.extension == "tar" || data.mime_type == "application/x-tar"
      end

      # Yields one virtual file data per file entry of the archive. Each
      # one is named after the entry's full name and has `data` as its
      # `source`. Entries that are not files are skipped.
      #
      # @param data The tar archive data.
      #
      # @yield [extracted] Called for each file entry.
      def decompose(data)
        archive = StringIO.new(data.body)
        Gem::Package::TarReader.new(archive) do |tar|
          tar.each do |entry|
            if entry.file?
              extracted = VirtualFileData.new(entry.full_name, entry)
              extracted.source = data
              yield(extracted)
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "rexml/document"
18
+ require "rexml/streamlistener"
19
+
20
module ChupaText
  module Decomposers
    # Decomposer that stream-parses XML and keeps only the text passed to
    # the listener's `text` callback.
    class XML < Decomposer
      registry.register("xml", self)

      # @param data The candidate data; its `extension` and `mime_type`
      #   are inspected.
      #
      # @return [Boolean] whether the extension is "xml" or the MIME type
      #   is "text/xml".
      def target?(data)
        return true if data.extension == "xml"
        data.mime_type == "text/xml"
      end

      # Parses `data` and yields a single text data that carries the
      # collected text and the URI of `data`.
      #
      # @param data The XML data to convert.
      #
      # @yield [text_data] Called once with the collected text.
      def decompose(data)
        collected = ""
        collector = Listener.new(collected)
        data.open do |input|
          REXML::Parsers::StreamParser.new(input, collector).parse
        end
        converted = TextData.new(collected)
        converted.uri = data.uri
        yield(converted)
      end

      # Stream listener that appends every text event to the given output.
      class Listener
        include REXML::StreamListener

        # @param output The object that receives text via `<<`.
        def initialize(output)
          @output = output
        end

        # Callback for text events.
        def text(content)
          @output << content
        end
      end
    end
  end
end
@@ -0,0 +1,91 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "pathname"
18
+ require "uri"
19
+
20
module ChupaText
  # Extracts text by handing data to the first registered decomposer that
  # accepts it, and repeating that for everything the decomposer produces.
  class Extractor
    def initialize
      @decomposers = []
    end

    # Sets the extractor up from the configuration by adding every
    # decomposer that the configuration enables.
    #
    # @param [Configuration] configuration The configuration to be
    #   applied.
    #
    # @return [void]
    def apply_configuration(configuration)
      enabled = Decomposers.create(Decomposer.registry,
                                   configuration.decomposer)
      enabled.each { |decomposer| add_decomposer(decomposer) }
    end

    # Appends a decomposer to the list consulted by {#extract}.
    # Decomposers are tried in the order they were added.
    def add_decomposer(decomposer)
      @decomposers << decomposer
    end

    # Extracts texts from input. Each extracted text is passed to the
    # given block.
    #
    # @param [Data, String] input The input to extract texts from.
    #   If `input` is a `String`, it is treated as the local file path or
    #   URI of the input data.
    #
    # @yield [text_data] Gives extracted text data to the block.
    #   The block may be called zero or more times.
    # @yieldparam [Data] text_data The extracted text data.
    #   You can get text data by `text_data.body`.
    #
    # @return [void]
    def extract(input)
      pending = [ensure_data(input)]
      until pending.empty?
        # Work is taken from the end, so the most recently produced data
        # is processed first.
        current = pending.pop
        handler = find_decomposer(current)
        if handler
          handler.decompose(current) do |decomposed|
            pending.push(decomposed)
          end
        elsif current.text?
          # Nothing can decompose it further: report it if it is text and
          # silently drop it otherwise.
          yield(current)
        end
      end
    end

    private
    # Wraps a path-like or URI-like input into a data object whose URI is
    # the string form of the input. Any other input is returned as is.
    def ensure_data(input)
      case input
      when String, Pathname, URI::Generic
        Data.new.tap { |data| data.uri = input.to_s }
      else
        input
      end
    end

    # Returns the first registered decomposer whose `target?` accepts
    # `data`, or nil when there is none.
    def find_decomposer(data)
      @decomposers.find { |candidate| candidate.target?(data) }
    end
  end
end
@@ -0,0 +1,35 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
module ChupaText
  # Content backed by a file on the local file system.
  class FileContent
    attr_reader :size, :path

    # Remembers the path and records the file's size at construction
    # time.
    #
    # @param path The path of the existing file.
    def initialize(path)
      @path = path
      @size = File.size(path)
    end

    # Opens the file for binary reading and passes the given block on to
    # `File.open`.
    def open(&block)
      File.open(@path, "rb", &block)
    end

    # Returns the whole file content. The content is read on the first
    # call and the cached value is reused afterwards.
    def body
      @body ||= open(&:read)
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "chupa-text/formatters/json"
@@ -0,0 +1,60 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "json"
18
+
19
module ChupaText
  module Formatters
    # Formatter that accumulates the headers of the source data and every
    # extracted text in a hash, then writes it as pretty-printed JSON.
    class JSON
      # @param output The object that receives the generated JSON via
      #   `<<`.
      def initialize(output)
        @output = output
        @formatted = {}
      end

      # Records the headers of the source data and starts an empty list
      # of texts.
      #
      # @param data The source data.
      def format_start(data)
        format_headers(data, @formatted)
        @formatted["texts"] = []
      end

      # Appends one extracted text: its headers plus its body.
      #
      # @param data The extracted text data.
      def format_extracted(data)
        entry = {}
        format_headers(data, entry)
        entry["body"] = data.body
        @formatted["texts"] << entry
      end

      # Writes everything accumulated so far as pretty-printed JSON
      # followed by a newline. The argument is not used.
      def format_finish(data)
        @output << ::JSON.pretty_generate(@formatted)
        @output << "\n"
      end

      private
      # Copies the MIME type, URI and size of `data` into `target`, in
      # that order, followed by every attribute of `data`.
      def format_headers(data, target)
        standard_headers = [
          ["mime-type", data.mime_type],
          ["uri", data.uri],
          ["size", data.size],
        ]
        standard_headers.each do |name, value|
          format_header(name, value, target)
        end
        data.attributes.each do |name, value|
          format_header(name, value, target)
        end
      end

      # Stores a header unless its value is nil.
      def format_header(name, value, target)
        target[name] = value unless value.nil?
      end
    end
  end
end