chupa-text 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +5 -0
  3. data/Gemfile +21 -0
  4. data/LICENSE.txt +502 -0
  5. data/README.md +91 -0
  6. data/Rakefile +46 -0
  7. data/bin/chupa-text +21 -0
  8. data/bin/chupa-text-generate-decomposer +21 -0
  9. data/chupa-text.gemspec +58 -0
  10. data/data/chupa-text.conf +5 -0
  11. data/data/mime-types.conf +19 -0
  12. data/doc/text/command-line.md +136 -0
  13. data/doc/text/decomposer.md +343 -0
  14. data/doc/text/library.md +72 -0
  15. data/doc/text/news.md +5 -0
  16. data/lib/chupa-text.rb +37 -0
  17. data/lib/chupa-text/command.rb +18 -0
  18. data/lib/chupa-text/command/chupa-text-generate-decomposer.rb +324 -0
  19. data/lib/chupa-text/command/chupa-text.rb +102 -0
  20. data/lib/chupa-text/configuration-loader.rb +95 -0
  21. data/lib/chupa-text/configuration.rb +49 -0
  22. data/lib/chupa-text/data.rb +149 -0
  23. data/lib/chupa-text/decomposer-registry.rb +37 -0
  24. data/lib/chupa-text/decomposer.rb +37 -0
  25. data/lib/chupa-text/decomposers.rb +59 -0
  26. data/lib/chupa-text/decomposers/csv.rb +44 -0
  27. data/lib/chupa-text/decomposers/gzip.rb +51 -0
  28. data/lib/chupa-text/decomposers/tar.rb +42 -0
  29. data/lib/chupa-text/decomposers/xml.rb +55 -0
  30. data/lib/chupa-text/extractor.rb +91 -0
  31. data/lib/chupa-text/file-content.rb +35 -0
  32. data/lib/chupa-text/formatters.rb +17 -0
  33. data/lib/chupa-text/formatters/json.rb +60 -0
  34. data/lib/chupa-text/input-data.rb +58 -0
  35. data/lib/chupa-text/mime-type-registry.rb +41 -0
  36. data/lib/chupa-text/mime-type.rb +36 -0
  37. data/lib/chupa-text/text-data.rb +26 -0
  38. data/lib/chupa-text/version.rb +19 -0
  39. data/lib/chupa-text/virtual-content.rb +91 -0
  40. data/lib/chupa-text/virtual-file-data.rb +46 -0
  41. data/test/command/test-chupa-text.rb +178 -0
  42. data/test/decomposers/test-csv.rb +48 -0
  43. data/test/decomposers/test-gzip.rb +113 -0
  44. data/test/decomposers/test-tar.rb +78 -0
  45. data/test/decomposers/test-xml.rb +58 -0
  46. data/test/fixture/command/chupa-text/hello.txt +1 -0
  47. data/test/fixture/command/chupa-text/hello.txt.gz +0 -0
  48. data/test/fixture/command/chupa-text/no-decomposer.conf +3 -0
  49. data/test/fixture/extractor/hello.txt +1 -0
  50. data/test/fixture/gzip/hello.tar.gz +0 -0
  51. data/test/fixture/gzip/hello.tgz +0 -0
  52. data/test/fixture/gzip/hello.txt.gz +0 -0
  53. data/test/fixture/tar/directory.tar +0 -0
  54. data/test/fixture/tar/top-level.tar +0 -0
  55. data/test/helper.rb +25 -0
  56. data/test/run-test.rb +35 -0
  57. data/test/test-configuration-loader.rb +54 -0
  58. data/test/test-data.rb +85 -0
  59. data/test/test-decomposer-registry.rb +30 -0
  60. data/test/test-decomposer.rb +41 -0
  61. data/test/test-decomposers.rb +59 -0
  62. data/test/test-extractor.rb +125 -0
  63. data/test/test-file-content.rb +51 -0
  64. data/test/test-mime-type-registry.rb +48 -0
  65. data/test/test-text-data.rb +36 -0
  66. data/test/test-virtual-content.rb +103 -0
  67. metadata +183 -0
@@ -0,0 +1,102 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "optparse"
18
+
19
module ChupaText
  module Command
    # The `chupa-text` command: parses the command line, extracts text
    # from the given input (or from standard input when no input is
    # given) and writes the result to standard output as JSON.
    class ChupaText
      # Builds a command object and runs it.
      #
      # @param arguments [Array<String>] The command line arguments.
      # @return [Boolean] `true` on success, `false` if the command
      #   line can't be processed.
      def self.run(*arguments)
        new.run(*arguments)
      end

      def initialize
        @input = nil
        @configuration = Configuration.default
      end

      # Runs the extraction described by `arguments`.
      #
      # @param arguments [Array<String>] The command line arguments.
      # @return [Boolean] `true` on success, `false` if the command
      #   line can't be processed.
      def run(*arguments)
        return false unless parse_arguments(arguments)

        Decomposers.load
        extractor = create_extractor
        data = create_data
        formatter = create_formatter
        formatter.format_start(data)
        extractor.extract(data) { |extracted| formatter.format_extracted(extracted) }
        formatter.format_finish(data)
        true
      end

      private

      # Applies the configuration file at `path` to the configuration
      # used by this command.
      def load_configuration(path)
        ConfigurationLoader.new(@configuration).load(path)
      end

      # Parses options and remembers the optional input in `@input`.
      # Problems are reported with `puts` and signalled by returning
      # `false`.
      def parse_arguments(arguments)
        option_parser = create_option_parser
        begin
          remaining = option_parser.parse!(arguments)
        rescue OptionParser::ParseError => error
          puts(error.message)
          return false
        end

        # At most one FILE_OR_URI is accepted.
        if remaining.size > 1
          puts(option_parser.help)
          return false
        end

        @input = remaining.first
        true
      end

      def create_option_parser
        OptionParser.new do |parser|
          parser.banner = "#{parser.banner} [FILE_OR_URI]"
          parser.version = VERSION
          parser.on("--configuration=FILE",
                    "Read configuration from FILE.") do |path|
            load_configuration(path)
          end
        end
      end

      def create_extractor
        Extractor.new.tap do |extractor|
          extractor.apply_configuration(@configuration)
        end
      end

      # Reads from standard input when no input was given on the
      # command line.
      def create_data
        return VirtualFileData.new(nil, $stdin) if @input.nil?

        InputData.new(@input)
      end

      def create_formatter
        Formatters::JSON.new($stdout)
      end
    end
  end
end
@@ -0,0 +1,95 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "pathname"
18
+
19
module ChupaText
  # Loads a configuration file into a configuration object.
  #
  # A configuration file is Ruby code evaluated in the context of the
  # loader, so it can use {#decomposer} and {#mime_type}.
  class ConfigurationLoader
    attr_reader :decomposer
    attr_reader :mime_type

    # @param configuration The configuration to be updated by loaded
    #   files. It must respond to `decomposer` and
    #   `mime_type_registry`.
    def initialize(configuration)
      @configuration = configuration
      @decomposer = DecomposerLoader.new(@configuration.decomposer)
      @mime_type = MIMETypeLoader.new(@configuration.mime_type_registry)
      bundled_data_dir = File.expand_path("../../data",
                                          File.dirname(__FILE__))
      @load_paths = [bundled_data_dir]
    end

    # Evaluates the configuration file at `path`.
    #
    # NOTE: The file content is executed as Ruby code by
    # `instance_eval`. Load only trusted files.
    #
    # @param path [String] The path of the configuration file. A
    #   relative path that doesn't exist is searched in the load paths.
    def load(path)
      resolved_path = resolve_path(path)
      source = File.read(resolved_path)
      instance_eval(source, resolved_path, 1)
    end

    private

    # Returns `path` as is when it exists or is absolute. Otherwise
    # returns the first existing candidate under the load paths, or
    # `path` itself when no candidate exists.
    def resolve_path(path)
      return path if File.exist?(path) || Pathname(path).absolute?

      candidates = @load_paths.map { |load_path| File.join(load_path, path) }
      candidates.find { |candidate| File.exist?(candidate) } || path
    end

    # The object exposed as `decomposer` in configuration files.
    class DecomposerLoader
      def initialize(configuration)
        @configuration = configuration
      end

      def names
        @configuration.names
      end

      def names=(names)
        @configuration.names = names
      end

      def [](name)
        @configuration.options[name]
      end

      def []=(name, options)
        @configuration.options[name] = options
      end

      # Maps `decomposer.NAME` to `decomposer["NAME"]` and
      # `decomposer.NAME = value` to `decomposer["NAME"] = value`.
      # Any other call shape is passed to `super`.
      def method_missing(name, *arguments)
        return super if block_given?

        method_name = name.to_s
        if method_name.end_with?("=") && arguments.size == 1
          self[method_name.chomp("=")] = arguments.first
        elsif arguments.empty?
          self[method_name]
        else
          super
        end
      end
    end

    # The object exposed as `mime_type` in configuration files.
    class MIMETypeLoader
      def initialize(registry)
        @registry = registry
      end

      # Registers `mime_type` for `extension` in the registry.
      def []=(extension, mime_type)
        @registry.register(extension, mime_type)
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
module ChupaText
  # Holds the decomposer settings and the MIME type registry.
  class Configuration
    # @return [Configuration] The shared configuration. It is created
    #   on the first call by loading `chupa-text.conf` and reused after
    #   that.
    def self.default
      @default ||= create_default
    end

    def self.create_default
      new.tap do |configuration|
        ConfigurationLoader.new(configuration).load("chupa-text.conf")
      end
    end
    private_class_method :create_default

    attr_reader :decomposer
    attr_accessor :mime_type_registry

    def initialize
      @decomposer = DecomposerConfiguration.new
      @mime_type_registry = MIMEType.registry
    end

    # Decomposer related settings: target names and per name options.
    class DecomposerConfiguration
      attr_accessor :names, :options

      def initialize
        @names = ["*"]
        @options = {}
      end
    end
  end
end
@@ -0,0 +1,149 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
require "open-uri"
require "pathname"
require "stringio"
require "uri"
19
+
20
module ChupaText
  # A unit of data handled by extraction: a body with an optional URI,
  # path, MIME type, free-form attributes and source data.
  class Data
    # @return [URI, nil] The URI of the data if the data is for remote
    #   or local file, `nil` if the data isn't associated with any
    #   URIs.
    attr_reader :uri

    # @return [String, nil] The content of the data, `nil` if the data
    #   doesn't have any content.
    attr_accessor :body

    # @return [Integer, nil] The byte size of the data, `nil` if the data
    #   doesn't have any content.
    attr_accessor :size

    # @return [String, nil] The path associated with the content of
    #   the data, `nil` if the data isn't associated with any file.
    #
    #   The path may not be related with the original content. For
    #   example, `"/tmp/XXX.txt"` may be returned for the data of
    #   `"http://example.com/XXX.txt"`.
    #
    #   This value is useful to use an external command to extract
    #   text and meta-data.
    attr_accessor :path

    # @return [Hash] The attributes of the data. They are also
    #   available by {#[]} and {#[]=}.
    attr_accessor :attributes

    # @return [Data, nil] The source of the data. For example, text
    #   data (`hello.txt`) in archive data (`hello.tar`) have the
    #   archive data in {#source}.
    attr_accessor :source

    def initialize
      @uri = nil
      @body = nil
      @size = nil
      @path = nil
      @mime_type = nil
      @attributes = {}
      @source = nil
    end

    # Gives the copy its own attributes hash so that changing
    # attributes of the copy doesn't change the original.
    def initialize_copy(object)
      super
      @attributes = @attributes.dup
      self
    end

    # @param [String, Pathname, URI, nil] uri The URI for the data. A
    #   `String` or `Pathname` is parsed by `URI.parse`, any other
    #   value is stored as is. If `uri` is `nil`, it means that the
    #   data isn't associated with any URIs.
    def uri=(uri)
      case uri
      when String, Pathname
        uri = URI.parse(uri.to_s)
      end
      @uri = uri
    end

    # @yield [StringIO] An IO that reads {#body}.
    def open
      yield(StringIO.new(body))
    end

    # @param name The attribute name.
    # @return The attribute value, `nil` if the attribute isn't set.
    def [](name)
      @attributes[name]
    end

    # @param name The attribute name.
    # @param value The attribute value.
    def []=(name, value)
      @attributes[name] = value
    end

    # @return [String] The MIME type of the data. If MIME type
    #   isn't set, guesses MIME type from path and body.
    # @return [nil] If MIME type isn't set and it can't guess MIME type
    #   from path and body.
    def mime_type
      @mime_type || guess_mime_type
    end

    # @param [String, nil] type The MIME type of the data. You can
    #   unset MIME type by `nil`. If you unset MIME type, MIME type
    #   is guessed from path and body of the data.
    def mime_type=(type)
      @mime_type = type
    end

    # @return [String, nil] Normalized extension as String if {#uri}
    #   is not `nil`, `nil` otherwise. The normalized extension uses
    #   lower case like `pdf` not `PDF`. It is an empty string when
    #   the URI has no extension.
    def extension
      return nil if @uri.nil?

      # `to_s` guards against a URI whose path is `nil`; such a URI is
      # treated like one without extension instead of raising.
      File.extname(@uri.path.to_s).downcase.sub(/\A\./, "")
    end

    # @return [Bool] true if MIME type is "text/XXX", false
    #   otherwise.
    def text?
      (mime_type || "").start_with?("text/")
    end

    private

    # The URI based guess wins over the body based guess.
    def guess_mime_type
      guess_mime_type_from_uri || guess_mime_type_from_body
    end

    def guess_mime_type_from_uri
      MIMEType.registry.find(extension)
    end

    # A body that is valid as UTF-8 is considered plain text.
    def guess_mime_type_from_body
      guessed_mime_type = nil
      change_encoding(body, "UTF-8") do |utf8_body|
        guessed_mime_type = "text/plain" if utf8_body.valid_encoding?
      end
      guessed_mime_type
    end

    # Yields `string` re-tagged with `encoding`. Does nothing when
    # `string` is `nil`.
    #
    # The re-tagging is done on a copy, so `string` itself is never
    # modified. This keeps frozen strings usable and never exposes the
    # caller's string with a temporary encoding.
    def change_encoding(string, encoding)
      return if string.nil?

      yield(string.dup.force_encoding(encoding))
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
module ChupaText
  # A table from name to decomposer class. It is enumerable over
  # name and decomposer class pairs.
  class DecomposerRegistry
    include Enumerable

    def initialize
      @decomposer_classes = {}
    end

    # Associates `decomposer_class` with `name`. An existing entry for
    # `name` is replaced.
    def register(name, decomposer_class)
      @decomposer_classes.store(name, decomposer_class)
    end

    # @return The decomposer class registered as `name`, `nil` if
    #   nothing is registered as `name`.
    def find(name)
      @decomposer_classes.fetch(name, nil)
    end

    # Iterates over the registered name and decomposer class pairs.
    def each(&block)
      @decomposer_classes.each_pair(&block)
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
module ChupaText
  # The base class of decomposers. Subclasses must implement
  # {#target?} and {#decompose}.
  class Decomposer
    # @return [DecomposerRegistry] The registry of decomposers. A class
    #   variable is used so that the same registry is returned for
    #   this class and its subclasses.
    def self.registry
      @@registry ||= DecomposerRegistry.new
    end

    # @param options The options for the decomposer. They are kept in
    #   `@options`.
    def initialize(options)
      @options = options
    end

    # @raise [NotImplementedError] Always. Subclasses must override
    #   this method.
    def target?(data)
      raise NotImplementedError,
            format("must implement %s#%s", self.class, __method__)
    end

    # @raise [NotImplementedError] Always. Subclasses must override
    #   this method.
    def decompose(data)
      raise NotImplementedError,
            format("must implement %s#%s", self.class, __method__)
    end
  end
end