chupa-text 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +5 -0
  3. data/Gemfile +21 -0
  4. data/LICENSE.txt +502 -0
  5. data/README.md +91 -0
  6. data/Rakefile +46 -0
  7. data/bin/chupa-text +21 -0
  8. data/bin/chupa-text-generate-decomposer +21 -0
  9. data/chupa-text.gemspec +58 -0
  10. data/data/chupa-text.conf +5 -0
  11. data/data/mime-types.conf +19 -0
  12. data/doc/text/command-line.md +136 -0
  13. data/doc/text/decomposer.md +343 -0
  14. data/doc/text/library.md +72 -0
  15. data/doc/text/news.md +5 -0
  16. data/lib/chupa-text.rb +37 -0
  17. data/lib/chupa-text/command.rb +18 -0
  18. data/lib/chupa-text/command/chupa-text-generate-decomposer.rb +324 -0
  19. data/lib/chupa-text/command/chupa-text.rb +102 -0
  20. data/lib/chupa-text/configuration-loader.rb +95 -0
  21. data/lib/chupa-text/configuration.rb +49 -0
  22. data/lib/chupa-text/data.rb +149 -0
  23. data/lib/chupa-text/decomposer-registry.rb +37 -0
  24. data/lib/chupa-text/decomposer.rb +37 -0
  25. data/lib/chupa-text/decomposers.rb +59 -0
  26. data/lib/chupa-text/decomposers/csv.rb +44 -0
  27. data/lib/chupa-text/decomposers/gzip.rb +51 -0
  28. data/lib/chupa-text/decomposers/tar.rb +42 -0
  29. data/lib/chupa-text/decomposers/xml.rb +55 -0
  30. data/lib/chupa-text/extractor.rb +91 -0
  31. data/lib/chupa-text/file-content.rb +35 -0
  32. data/lib/chupa-text/formatters.rb +17 -0
  33. data/lib/chupa-text/formatters/json.rb +60 -0
  34. data/lib/chupa-text/input-data.rb +58 -0
  35. data/lib/chupa-text/mime-type-registry.rb +41 -0
  36. data/lib/chupa-text/mime-type.rb +36 -0
  37. data/lib/chupa-text/text-data.rb +26 -0
  38. data/lib/chupa-text/version.rb +19 -0
  39. data/lib/chupa-text/virtual-content.rb +91 -0
  40. data/lib/chupa-text/virtual-file-data.rb +46 -0
  41. data/test/command/test-chupa-text.rb +178 -0
  42. data/test/decomposers/test-csv.rb +48 -0
  43. data/test/decomposers/test-gzip.rb +113 -0
  44. data/test/decomposers/test-tar.rb +78 -0
  45. data/test/decomposers/test-xml.rb +58 -0
  46. data/test/fixture/command/chupa-text/hello.txt +1 -0
  47. data/test/fixture/command/chupa-text/hello.txt.gz +0 -0
  48. data/test/fixture/command/chupa-text/no-decomposer.conf +3 -0
  49. data/test/fixture/extractor/hello.txt +1 -0
  50. data/test/fixture/gzip/hello.tar.gz +0 -0
  51. data/test/fixture/gzip/hello.tgz +0 -0
  52. data/test/fixture/gzip/hello.txt.gz +0 -0
  53. data/test/fixture/tar/directory.tar +0 -0
  54. data/test/fixture/tar/top-level.tar +0 -0
  55. data/test/helper.rb +25 -0
  56. data/test/run-test.rb +35 -0
  57. data/test/test-configuration-loader.rb +54 -0
  58. data/test/test-data.rb +85 -0
  59. data/test/test-decomposer-registry.rb +30 -0
  60. data/test/test-decomposer.rb +41 -0
  61. data/test/test-decomposers.rb +59 -0
  62. data/test/test-extractor.rb +125 -0
  63. data/test/test-file-content.rb +51 -0
  64. data/test/test-mime-type-registry.rb +48 -0
  65. data/test/test-text-data.rb +36 -0
  66. data/test/test-virtual-content.rb +103 -0
  67. metadata +183 -0
@@ -0,0 +1,59 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
module ChupaText
  # Helpers for discovering decomposer source files and for instantiating
  # the decomposers enabled by a configuration.
  module Decomposers
    class << self
      # Requires every `chupa-text/decomposers/*.rb` file found under the
      # directories listed in `$LOAD_PATH`.
      def load
        paths = []
        $LOAD_PATH.each do |load_path|
          next unless File.directory?(load_path)
          # Glob relative to the load path so that each collected path is
          # directly usable as a `require` feature name.
          Dir.chdir(load_path) do
            Dir.glob("chupa-text/decomposers/*.rb") do |decomposer_path|
              paths << decomposer_path.gsub(/\.rb\z/, "")
            end
          end
        end
        paths.each do |path|
          require path
        end
      end

      # Instantiates the decomposers enabled by `configuration`.
      #
      # @param registry The decomposer registry. It must respond to
      #   `each` (yielding the registered name first) and `find(name)`.
      # @param configuration The decomposer configuration. It must respond
      #   to `names` (glob patterns of enabled decomposers) and `options`
      #   (options keyed by decomposer name).
      #
      # @return [Array] One decomposer instance per resolved name.
      def create(registry, configuration)
        enabled_names = resolve_names(registry, configuration.names)
        enabled_names.collect do |enabled_name|
          decomposer_class = registry.find(enabled_name)
          # Look the options up by the decomposer's registered name. A bare
          # `name` here would be `Module#name` of this module, so the
          # configured options would never be found.
          options = configuration.options[enabled_name] || {}
          decomposer_class.new(options)
        end
      end

      private
      # Expands the glob patterns in `enabled_names` to the registered
      # decomposer names they match.
      #
      # @param registry The decomposer registry to be searched.
      # @param [Array<String>] enabled_names Glob patterns.
      #
      # @return [Array<String>] The matched names, in pattern order.
      def resolve_names(registry, enabled_names)
        resolved_names = []
        flag = 0
        # FNM_EXTGLOB enables `{a,b}` patterns; it isn't available on
        # every Ruby, so it is used only when defined.
        flag |= File::FNM_EXTGLOB if File.const_defined?(:FNM_EXTGLOB)
        enabled_names.each do |enabled_name|
          registry.each do |name,|
            next unless File.fnmatch(enabled_name, name, flag)
            resolved_names << name
          end
        end
        resolved_names
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "csv"
18
+
19
module ChupaText
  module Decomposers
    # Decomposer that flattens CSV data into plain text: the fields of a
    # row are joined by a space and each row ends with a newline.
    class CSV < Decomposer
      registry.register("csv", self)

      # @param data The data to be checked.
      #
      # @return [Boolean] `true` when `data` has the `csv` extension or
      #   the `text/csv` MIME type.
      def target?(data)
        data.extension == "csv" || data.mime_type == "text/csv"
      end

      # Converts the CSV in `data` to text.
      #
      # @param data The CSV data to be decomposed.
      #
      # @yield [text_data] Called once with the extracted text.
      def decompose(data)
        joined_rows = ""
        data.open do |input|
          ::CSV.new(input).each do |row|
            joined_rows << row.join(" ") << "\n"
          end
        end
        text_data = TextData.new(joined_rows)
        text_data.uri = data.uri
        yield(text_data)
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "stringio"
18
+ require "zlib"
19
+
20
module ChupaText
  module Decomposers
    # Decomposer that unwraps gzip-compressed data and passes the
    # decompressed stream on as a virtual file.
    class Gzip < Decomposer
      registry.register("gzip", self)

      TARGET_EXTENSIONS = %w[gz tgz]
      TARGET_MIME_TYPES = [
        "application/gzip",
        "application/x-gzip",
        "application/x-gtar-compressed",
      ]

      # @param data The data to be checked.
      #
      # @return [Boolean] `true` when the extension or the MIME type of
      #   `data` is one of the gzip ones listed above.
      def target?(data)
        TARGET_EXTENSIONS.include?(data.extension) ||
          TARGET_MIME_TYPES.include?(data.mime_type)
      end

      # Wraps the body of `data` in a gzip reader and yields it as a
      # virtual file.
      #
      # @param data The gzip-compressed data.
      #
      # @yield [extracted] Called once with the decompressed virtual file.
      def decompose(data)
        gzip_reader = Zlib::GzipReader.new(StringIO.new(data.body))
        # Derive the URI of the inner file from the outer one: drop
        # ".gz", or turn ".tgz" into ".tar". It stays nil for any other
        # extension (e.g. when only the MIME type matched).
        inner_uri =
          case data.extension
          when "gz"
            data.uri.to_s.sub(/\.gz\z/i, "")
          when "tgz"
            data.uri.to_s.sub(/\.tgz\z/i, ".tar")
          end
        extracted = VirtualFileData.new(inner_uri, gzip_reader)
        extracted.source = data
        yield(extracted)
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "stringio"
18
+ require "rubygems/package"
19
+
20
module ChupaText
  module Decomposers
    # Decomposer that yields each regular file stored in a tar archive
    # as a virtual file.
    class Tar < Decomposer
      registry.register("tar", self)

      # @param data The data to be checked.
      #
      # @return [Boolean] `true` when `data` has the `tar` extension or
      #   the `application/x-tar` MIME type.
      def target?(data)
        data.extension == "tar" || data.mime_type == "application/x-tar"
      end

      # Walks the archive held in the body of `data`.
      #
      # @param data The tar archive data.
      #
      # @yield [extracted] Called once per file entry. Entries that
      #   aren't files are skipped.
      def decompose(data)
        archive = StringIO.new(data.body)
        Gem::Package::TarReader.new(archive) do |tar|
          tar.each do |entry|
            if entry.file?
              extracted = VirtualFileData.new(entry.full_name, entry)
              extracted.source = data
              yield(extracted)
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "rexml/document"
18
+ require "rexml/streamlistener"
19
+
20
module ChupaText
  module Decomposers
    # Decomposer that collects the text reported by REXML's stream parser
    # for an XML document.
    class XML < Decomposer
      registry.register("xml", self)

      # @param data The data to be checked.
      #
      # @return [Boolean] `true` when `data` has the `xml` extension or
      #   the `text/xml` MIME type.
      def target?(data)
        data.extension == "xml" || data.mime_type == "text/xml"
      end

      # Parses the XML in `data` and gathers its text.
      #
      # @param data The XML data to be decomposed.
      #
      # @yield [text_data] Called once with the extracted text.
      def decompose(data)
        collected = ""
        collector = Listener.new(collected)
        data.open do |input|
          REXML::Parsers::StreamParser.new(input, collector).parse
        end
        text_data = TextData.new(collected)
        text_data.uri = data.uri
        yield(text_data)
      end

      # Stream listener that appends every text event to the output
      # given at construction time.
      class Listener
        include REXML::StreamListener

        # @param output The object text is appended to with `<<`.
        def initialize(output)
          @output = output
        end

        # Called by the parser with a piece of text.
        def text(content)
          @output << content
        end
      end
    end
  end
end
@@ -0,0 +1,91 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "pathname"
18
+ require "uri"
19
+
20
module ChupaText
  # Extracts texts from input data by repeatedly applying the registered
  # decomposers.
  class Extractor
    def initialize
      @decomposers = []
    end

    # Sets the extractor up by the configuration. It adds decomposers
    # enabled in the configuration.
    #
    # @param [Configuration] configuration The configuration to be
    #   applied.
    #
    # @return [void]
    def apply_configuration(configuration)
      enabled = Decomposers.create(Decomposer.registry,
                                   configuration.decomposer)
      enabled.each { |decomposer| add_decomposer(decomposer) }
    end

    # Appends a decomposer. Decomposers are tried in the order they
    # were added.
    #
    # @param decomposer An object that responds to `target?(data)` and
    #   `decompose(data)`.
    def add_decomposer(decomposer)
      @decomposers << decomposer
    end

    # Extracts texts from input. Each extracted text is passed to the
    # given block.
    #
    # @param [Data, String] input The input to be extracted texts.
    #   If `input` is `String`, it is treated as the local file path or URI
    #   of input data.
    #
    # @yield [text_data] Gives extracted text data to the block.
    #   The block may be called zero or more times.
    # @yieldparam [Data] text_data The extracted text data.
    #   You can get text data by `text_data.body`.
    #
    # @return [void]
    def extract(input)
      pending = [ensure_data(input)]
      until pending.empty?
        # Take from the end: the most recently decomposed data is
        # processed first.
        current = pending.pop
        decomposer = find_decomposer(current)
        if decomposer
          decomposer.decompose(current) do |decomposed|
            pending.push(decomposed)
          end
        elsif current.text?
          # No decomposer handles this data; it is a result only when it
          # is already text.
          yield(current)
        end
      end
    end

    private
    # Wraps a path or URI in a `Data`; any other input is returned as is.
    def ensure_data(input)
      case input
      when String, Pathname, URI::Generic
        wrapped = Data.new
        wrapped.uri = input.to_s
        wrapped
      else
        input
      end
    end

    # Returns the first added decomposer whose `target?` accepts `data`,
    # or `nil` when there is none.
    def find_decomposer(data)
      @decomposers.find { |decomposer| decomposer.target?(data) }
    end
  end
end
@@ -0,0 +1,35 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
module ChupaText
  # Content backed by a file on the local file system.
  class FileContent
    # `size` is the file size read when the object is created; `path`
    # is the path given to the constructor.
    attr_reader :size, :path

    # @param path The path of the file. Its size is read immediately.
    def initialize(path)
      @path = path
      @size = File.size(path)
    end

    # Opens the file in binary read mode and passes the block on to
    # `File.open`.
    def open(&block)
      File.open(@path, "rb", &block)
    end

    # @return [String] The whole content of the file. It is read on the
    #   first call and cached.
    def body
      @body ||= open(&:read)
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "chupa-text/formatters/json"
@@ -0,0 +1,60 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "json"
18
+
19
module ChupaText
  module Formatters
    # Formatter that gathers the headers of the input data and of every
    # extracted text, then writes them as one pretty-printed JSON
    # document.
    class JSON
      # @param output The object the JSON is appended to with `<<`.
      def initialize(output)
        @output = output
        @formatted = {}
      end

      # Records the headers of the input data and starts an empty
      # `"texts"` list.
      #
      # @param data The input data.
      def format_start(data)
        format_headers(data, @formatted)
        @formatted["texts"] = []
      end

      # Appends the headers and the body of one extracted text to the
      # `"texts"` list.
      #
      # @param data The extracted text data.
      def format_extracted(data)
        entry = {}
        format_headers(data, entry)
        entry["body"] = data.body
        @formatted["texts"] << entry
      end

      # Writes the collected document followed by a newline.
      #
      # @param data Not used.
      def format_finish(data)
        document = ::JSON.pretty_generate(@formatted)
        @output << document
        @output << "\n"
      end

      private
      # Copies the MIME type, URI, size and then every attribute of
      # `data` into `target`.
      def format_headers(data, target)
        format_header("mime-type", data.mime_type, target)
        format_header("uri", data.uri, target)
        format_header("size", data.size, target)
        data.attributes.each do |key, attribute|
          format_header(key, attribute, target)
        end
      end

      # Stores `value` under `name`; nil values are left out.
      def format_header(name, value, target)
        target[name] = value unless value.nil?
      end
    end
  end
end