chupa-text 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +5 -0
  3. data/Gemfile +21 -0
  4. data/LICENSE.txt +502 -0
  5. data/README.md +91 -0
  6. data/Rakefile +46 -0
  7. data/bin/chupa-text +21 -0
  8. data/bin/chupa-text-generate-decomposer +21 -0
  9. data/chupa-text.gemspec +58 -0
  10. data/data/chupa-text.conf +5 -0
  11. data/data/mime-types.conf +19 -0
  12. data/doc/text/command-line.md +136 -0
  13. data/doc/text/decomposer.md +343 -0
  14. data/doc/text/library.md +72 -0
  15. data/doc/text/news.md +5 -0
  16. data/lib/chupa-text.rb +37 -0
  17. data/lib/chupa-text/command.rb +18 -0
  18. data/lib/chupa-text/command/chupa-text-generate-decomposer.rb +324 -0
  19. data/lib/chupa-text/command/chupa-text.rb +102 -0
  20. data/lib/chupa-text/configuration-loader.rb +95 -0
  21. data/lib/chupa-text/configuration.rb +49 -0
  22. data/lib/chupa-text/data.rb +149 -0
  23. data/lib/chupa-text/decomposer-registry.rb +37 -0
  24. data/lib/chupa-text/decomposer.rb +37 -0
  25. data/lib/chupa-text/decomposers.rb +59 -0
  26. data/lib/chupa-text/decomposers/csv.rb +44 -0
  27. data/lib/chupa-text/decomposers/gzip.rb +51 -0
  28. data/lib/chupa-text/decomposers/tar.rb +42 -0
  29. data/lib/chupa-text/decomposers/xml.rb +55 -0
  30. data/lib/chupa-text/extractor.rb +91 -0
  31. data/lib/chupa-text/file-content.rb +35 -0
  32. data/lib/chupa-text/formatters.rb +17 -0
  33. data/lib/chupa-text/formatters/json.rb +60 -0
  34. data/lib/chupa-text/input-data.rb +58 -0
  35. data/lib/chupa-text/mime-type-registry.rb +41 -0
  36. data/lib/chupa-text/mime-type.rb +36 -0
  37. data/lib/chupa-text/text-data.rb +26 -0
  38. data/lib/chupa-text/version.rb +19 -0
  39. data/lib/chupa-text/virtual-content.rb +91 -0
  40. data/lib/chupa-text/virtual-file-data.rb +46 -0
  41. data/test/command/test-chupa-text.rb +178 -0
  42. data/test/decomposers/test-csv.rb +48 -0
  43. data/test/decomposers/test-gzip.rb +113 -0
  44. data/test/decomposers/test-tar.rb +78 -0
  45. data/test/decomposers/test-xml.rb +58 -0
  46. data/test/fixture/command/chupa-text/hello.txt +1 -0
  47. data/test/fixture/command/chupa-text/hello.txt.gz +0 -0
  48. data/test/fixture/command/chupa-text/no-decomposer.conf +3 -0
  49. data/test/fixture/extractor/hello.txt +1 -0
  50. data/test/fixture/gzip/hello.tar.gz +0 -0
  51. data/test/fixture/gzip/hello.tgz +0 -0
  52. data/test/fixture/gzip/hello.txt.gz +0 -0
  53. data/test/fixture/tar/directory.tar +0 -0
  54. data/test/fixture/tar/top-level.tar +0 -0
  55. data/test/helper.rb +25 -0
  56. data/test/run-test.rb +35 -0
  57. data/test/test-configuration-loader.rb +54 -0
  58. data/test/test-data.rb +85 -0
  59. data/test/test-decomposer-registry.rb +30 -0
  60. data/test/test-decomposer.rb +41 -0
  61. data/test/test-decomposers.rb +59 -0
  62. data/test/test-extractor.rb +125 -0
  63. data/test/test-file-content.rb +51 -0
  64. data/test/test-mime-type-registry.rb +48 -0
  65. data/test/test-text-data.rb +36 -0
  66. data/test/test-virtual-content.rb +103 -0
  67. metadata +183 -0
@@ -0,0 +1,102 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "optparse"
18
+
19
module ChupaText
  module Command
    # The `chupa-text` command: takes at most one input (a file name or
    # a URI, falling back to the standard input when omitted), runs the
    # extractor over it and reports every extracted data through the
    # JSON formatter on the standard output.
    class ChupaText
      # Builds a command object and runs it with the given arguments.
      #
      # @param arguments [Array<String>] The command line arguments.
      # @return [Boolean] The result of {#run}.
      def self.run(*arguments)
        new.run(*arguments)
      end

      def initialize
        @input = nil
        @configuration = Configuration.default
      end

      # Parses the arguments and, when they are acceptable, extracts
      # the input and formats the result.
      #
      # @param arguments [Array<String>] The command line arguments.
      # @return [Boolean] `false` if the arguments were rejected,
      #   `true` once formatting has finished.
      def run(*arguments)
        return false unless parse_arguments(arguments)

        Decomposers.load
        extractor = create_extractor
        data = create_data
        formatter = create_formatter

        formatter.format_start(data)
        extractor.extract(data) { |extracted| formatter.format_extracted(extracted) }
        formatter.format_finish(data)
        true
      end

      private

      # Evaluates the configuration file at `path` against the
      # configuration used by this command.
      def load_configuration(path)
        ConfigurationLoader.new(@configuration).load(path)
      end

      # Consumes the options and remembers the optional input argument
      # in `@input`.
      #
      # @return [Boolean] `false` (after printing the reason or the
      #   help message) for invalid options or more than one input,
      #   `true` otherwise.
      def parse_arguments(arguments)
        parser = create_option_parser
        begin
          rest = parser.parse!(arguments)
        rescue OptionParser::ParseError => error
          puts(error.message)
          return false
        end

        unless rest.size <= 1
          puts(parser.help)
          return false
        end

        @input = rest.first
        true
      end

      # @return [OptionParser] The parser that knows the options of
      #   this command.
      def create_option_parser
        OptionParser.new do |parser|
          parser.banner += " [FILE_OR_URI]"
          parser.version = VERSION
          parser.on("--configuration=FILE",
                    "Read configuration from FILE.") do |path|
            load_configuration(path)
          end
        end
      end

      # @return [Extractor] An extractor with the current configuration
      #   applied.
      def create_extractor
        Extractor.new.tap do |extractor|
          extractor.apply_configuration(@configuration)
        end
      end

      # @return [VirtualFileData, InputData] Data backed by the standard
      #   input when no input argument was given, data for the input
      #   argument otherwise.
      def create_data
        return VirtualFileData.new(nil, $stdin) if @input.nil?

        InputData.new(@input)
      end

      # @return [Formatters::JSON] A formatter that writes to the
      #   standard output.
      def create_formatter
        Formatters::JSON.new($stdout)
      end
    end
  end
end
@@ -0,0 +1,95 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "pathname"
18
+
19
module ChupaText
  # Loads a configuration file into a configuration object.
  #
  # A configuration file is Ruby code that is evaluated with the loader
  # as `self`, so it can use {#decomposer} and {#mime_type} directly,
  # for example `decomposer.names = ["tar"]` or
  # `mime_type["txt"] = "text/plain"`.
  class ConfigurationLoader
    attr_reader :decomposer, :mime_type

    # @param configuration [#decomposer, #mime_type_registry] The
    #   configuration that loaded files write into.
    def initialize(configuration)
      @configuration = configuration
      @decomposer = DecomposerLoader.new(@configuration.decomposer)
      @mime_type = MIMETypeLoader.new(@configuration.mime_type_registry)
      # The "data" directory two levels above this file is searched for
      # relative paths that don't exist as given.
      @load_paths = [File.expand_path("../../data", File.dirname(__FILE__))]
    end

    # Evaluates the configuration file found for `path`.
    #
    # NOTE: the file content is executed as Ruby code via
    # `instance_eval`; only load files you trust.
    #
    # @param path [String] The path of the configuration file. See
    #   {#resolve_path} for how it is looked up.
    # @return [Object] The value of the last expression in the file.
    def load(path)
      resolved = resolve_path(path)
      instance_eval(File.read(resolved), resolved, 1)
    end

    private

    # Finds the file to read for `path`: the path itself when it exists
    # or is absolute, otherwise the first existing file of that name
    # under the load paths, otherwise the path unchanged.
    def resolve_path(path)
      return path if File.exist?(path) || Pathname(path).absolute?

      candidates = @load_paths.map { |load_path| File.join(load_path, path) }
      candidates.find { |candidate| File.exist?(candidate) } || path
    end

    # The object behind `decomposer` in a configuration file. It exposes
    # the names and the per-decomposer options of the wrapped decomposer
    # configuration.
    class DecomposerLoader
      def initialize(configuration)
        @configuration = configuration
      end

      def names
        @configuration.names
      end

      def names=(names)
        @configuration.names = names
      end

      def [](name)
        @configuration.options[name]
      end

      def []=(name, options)
        @configuration.options[name] = options
      end

      # Maps `loader.NAME = options` to `loader["NAME"] = options` and
      # `loader.NAME` to `loader["NAME"]`. Calls with a block or with
      # any other argument shape fall through to the default behavior.
      def method_missing(name, *arguments)
        return super if block_given?

        key = name.to_s
        if key.end_with?("=") && arguments.size == 1
          self[key.chomp("=")] = arguments.first
        elsif arguments.empty?
          self[key]
        else
          super
        end
      end
    end

    # The object behind `mime_type` in a configuration file. Assigning
    # `mime_type[extension] = type` registers the pair in the wrapped
    # registry.
    class MIMETypeLoader
      def initialize(registry)
        @registry = registry
      end

      def []=(extension, mime_type)
        @registry.register(extension, mime_type)
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
module ChupaText
  # Holds the settings used for extraction: the decomposer
  # configuration and the MIME type registry.
  class Configuration
    # @return [Configuration] The configuration populated from
    #   `chupa-text.conf`. It is created on first use and the same
    #   object is returned afterwards.
    def self.default
      @default ||= create_default
    end

    # Builds a new configuration and loads `chupa-text.conf` into it.
    def self.create_default
      new.tap do |configuration|
        ConfigurationLoader.new(configuration).load("chupa-text.conf")
      end
    end
    private_class_method :create_default

    attr_reader :decomposer
    attr_accessor :mime_type_registry

    def initialize
      @decomposer = DecomposerConfiguration.new
      @mime_type_registry = MIMEType.registry
    end

    # Settings for decomposers: the list of names (initially `["*"]`)
    # and a hash of options (initially empty).
    class DecomposerConfiguration
      attr_accessor :names, :options

      def initialize
        @names = ["*"]
        @options = {}
      end
    end
  end
end
@@ -0,0 +1,149 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "uri"
18
+ require "open-uri"
19
+
20
# Pathname is referenced in Data#uri= and StringIO in Data#open; require
# them here so this file doesn't depend on another file loading them.
require "pathname"
require "stringio"

module ChupaText
  # A piece of content handled by the extractor together with its URI,
  # path, MIME type, free-form attributes and the data it came from.
  class Data
    # @return [URI, nil] The URI of the data if the data is for remote
    #   or local file, `nil` if the data isn't associated with any
    #   URIs.
    attr_reader :uri

    # @return [String, nil] The content of the data, `nil` if the data
    #   doesn't have any content.
    attr_accessor :body

    # @return [Integer, nil] The byte size of the data, `nil` if the data
    #   doesn't have any content.
    attr_accessor :size

    # @return [String, nil] The path associated with the content of
    #   the data, `nil` if the data isn't associated with any file.
    #
    #   The path may not be related with the original content. For
    #   example, `"/tmp/XXX.txt"` may be returned for the data of
    #   `"http://example.com/XXX.txt"`.
    #
    #   This value is useful to use an external command to extract
    #   text and meta-data.
    attr_accessor :path

    # @return [Hash] The attributes of the data. They are also
    #   available through {#[]} and {#[]=}.
    attr_accessor :attributes

    # @return [Data, nil] The source of the data. For example, text
    #   data (`hello.txt`) in archive data (`hello.tar`) have the
    #   archive data in {#source}.
    attr_accessor :source

    def initialize
      @uri = nil
      @body = nil
      @size = nil
      @path = nil
      @mime_type = nil
      @attributes = {}
      @source = nil
    end

    # Gives the copy its own attributes hash so that changing an
    # attribute of the copy doesn't change the original.
    def initialize_copy(object)
      super
      @attributes = @attributes.dup
      self
    end

    # @param [String, Pathname, URI, nil] uri The URI for the data.
    #   Strings and path names are parsed into a URI. If `uri` is
    #   `nil`, it means that the data isn't associated with any URIs.
    def uri=(uri)
      case uri
      when String, Pathname
        uri = URI.parse(uri.to_s)
      end
      @uri = uri
    end

    # @yield [StringIO] An IO that reads {#body}.
    def open
      yield(StringIO.new(body))
    end

    # @return [Object, nil] The attribute stored for `name`.
    def [](name)
      @attributes[name]
    end

    # Stores `value` as the attribute `name`.
    def []=(name, value)
      @attributes[name] = value
    end

    # @return [String] The MIME type of the data. If MIME type
    #   isn't set, guesses MIME type from path and body.
    # @return [nil] If MIME type isn't set and it can't guess MIME type
    #   from path and body.
    def mime_type
      @mime_type || guess_mime_type
    end

    # @param [String, nil] type The MIME type of the data. You can
    #   unset MIME type by `nil`. If you unset MIME type, MIME type
    #   is guessed from path and body of the data.
    def mime_type=(type)
      @mime_type = type
    end

    # @return [String, nil] Normalized extension as String if {#uri}
    #   is not `nil` and has a path, `nil` otherwise. The normalized
    #   extension uses lower case like `pdf` not `PDF`.
    def extension
      return nil if @uri.nil?

      uri_path = @uri.path
      # Opaque URIs such as "urn:isbn:..." have no path component;
      # File.extname(nil) would raise TypeError.
      return nil if uri_path.nil?

      File.extname(uri_path).downcase.sub(/\A\./, "")
    end

    # @return [Bool] true if MIME type is "text/XXX", false
    #   otherwise.
    def text?
      (mime_type || "").start_with?("text/")
    end

    private

    # Tries the extension of the URI first and the body second.
    def guess_mime_type
      guess_mime_type_from_uri or
        guess_mime_type_from_body
    end

    def guess_mime_type_from_uri
      MIMEType.registry.find(extension)
    end

    # @return [String, nil] `"text/plain"` if the bytes of the body are
    #   valid UTF-8, `nil` if they aren't or if there is no body.
    def guess_mime_type_from_body
      mime_type = nil
      change_encoding(body, "UTF-8") do |utf8_body|
        mime_type = "text/plain" if utf8_body.valid_encoding?
      end
      mime_type
    end

    # Yields `string` reinterpreted in `encoding`.
    #
    # The reinterpretation is done on a duplicate: calling
    # `force_encoding` on the original would raise for a frozen string
    # and would temporarily change a string that the caller owns.
    #
    # @return [Object, nil] The value of the block, `nil` if `string`
    #   is `nil` (the block isn't called then).
    def change_encoding(string, encoding)
      return if string.nil?

      yield(string.dup.force_encoding(encoding))
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
module ChupaText
  # A name to decomposer class table. It is enumerable: each element is
  # a `[name, decomposer_class]` pair.
  class DecomposerRegistry
    include Enumerable

    def initialize
      @decomposer_classes = {}
    end

    # Associates `decomposer_class` with `name`, replacing any class
    # registered for the same name before.
    #
    # @return [Object] `decomposer_class`.
    def register(name, decomposer_class)
      @decomposer_classes.store(name, decomposer_class)
    end

    # @return [Object, nil] The class registered for `name`, `nil` if
    #   nothing is registered for it.
    def find(name)
      @decomposer_classes[name]
    end

    # Passes each registered name and class to the block.
    def each(&block)
      @decomposer_classes.each_pair(&block)
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
module ChupaText
  # The base class of decomposers. Subclasses must implement {#target?}
  # and {#decompose}.
  class Decomposer
    # @return [DecomposerRegistry] The registry of decomposers. It is
    #   created on first use and kept in a class variable, so this class
    #   and its subclasses all see the same registry.
    def self.registry
      @@registry ||= DecomposerRegistry.new
    end

    # @param options [Object] The options for this decomposer. They are
    #   kept in `@options` for subclasses.
    def initialize(options)
      @options = options
    end

    # @param data [Object] The data to be checked.
    # @raise [NotImplementedError] Always; subclasses must override
    #   this method.
    def target?(data)
      message = "must implement #{self.class}\##{__method__}"
      raise NotImplementedError, message
    end

    # @param data [Object] The data to be decomposed.
    # @raise [NotImplementedError] Always; subclasses must override
    #   this method.
    def decompose(data)
      message = "must implement #{self.class}\##{__method__}"
      raise NotImplementedError, message
    end
  end
end