chupa-text 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +5 -0
- data/Gemfile +21 -0
- data/LICENSE.txt +502 -0
- data/README.md +91 -0
- data/Rakefile +46 -0
- data/bin/chupa-text +21 -0
- data/bin/chupa-text-generate-decomposer +21 -0
- data/chupa-text.gemspec +58 -0
- data/data/chupa-text.conf +5 -0
- data/data/mime-types.conf +19 -0
- data/doc/text/command-line.md +136 -0
- data/doc/text/decomposer.md +343 -0
- data/doc/text/library.md +72 -0
- data/doc/text/news.md +5 -0
- data/lib/chupa-text.rb +37 -0
- data/lib/chupa-text/command.rb +18 -0
- data/lib/chupa-text/command/chupa-text-generate-decomposer.rb +324 -0
- data/lib/chupa-text/command/chupa-text.rb +102 -0
- data/lib/chupa-text/configuration-loader.rb +95 -0
- data/lib/chupa-text/configuration.rb +49 -0
- data/lib/chupa-text/data.rb +149 -0
- data/lib/chupa-text/decomposer-registry.rb +37 -0
- data/lib/chupa-text/decomposer.rb +37 -0
- data/lib/chupa-text/decomposers.rb +59 -0
- data/lib/chupa-text/decomposers/csv.rb +44 -0
- data/lib/chupa-text/decomposers/gzip.rb +51 -0
- data/lib/chupa-text/decomposers/tar.rb +42 -0
- data/lib/chupa-text/decomposers/xml.rb +55 -0
- data/lib/chupa-text/extractor.rb +91 -0
- data/lib/chupa-text/file-content.rb +35 -0
- data/lib/chupa-text/formatters.rb +17 -0
- data/lib/chupa-text/formatters/json.rb +60 -0
- data/lib/chupa-text/input-data.rb +58 -0
- data/lib/chupa-text/mime-type-registry.rb +41 -0
- data/lib/chupa-text/mime-type.rb +36 -0
- data/lib/chupa-text/text-data.rb +26 -0
- data/lib/chupa-text/version.rb +19 -0
- data/lib/chupa-text/virtual-content.rb +91 -0
- data/lib/chupa-text/virtual-file-data.rb +46 -0
- data/test/command/test-chupa-text.rb +178 -0
- data/test/decomposers/test-csv.rb +48 -0
- data/test/decomposers/test-gzip.rb +113 -0
- data/test/decomposers/test-tar.rb +78 -0
- data/test/decomposers/test-xml.rb +58 -0
- data/test/fixture/command/chupa-text/hello.txt +1 -0
- data/test/fixture/command/chupa-text/hello.txt.gz +0 -0
- data/test/fixture/command/chupa-text/no-decomposer.conf +3 -0
- data/test/fixture/extractor/hello.txt +1 -0
- data/test/fixture/gzip/hello.tar.gz +0 -0
- data/test/fixture/gzip/hello.tgz +0 -0
- data/test/fixture/gzip/hello.txt.gz +0 -0
- data/test/fixture/tar/directory.tar +0 -0
- data/test/fixture/tar/top-level.tar +0 -0
- data/test/helper.rb +25 -0
- data/test/run-test.rb +35 -0
- data/test/test-configuration-loader.rb +54 -0
- data/test/test-data.rb +85 -0
- data/test/test-decomposer-registry.rb +30 -0
- data/test/test-decomposer.rb +41 -0
- data/test/test-decomposers.rb +59 -0
- data/test/test-extractor.rb +125 -0
- data/test/test-file-content.rb +51 -0
- data/test/test-mime-type-registry.rb +48 -0
- data/test/test-text-data.rb +36 -0
- data/test/test-virtual-content.rb +103 -0
- metadata +183 -0
@@ -0,0 +1,102 @@
|
|
1
|
+
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "optparse"
|
18
|
+
|
19
|
+
module ChupaText
  module Command
    # Implements the `chupa-text` command line tool: parses the command
    # line, builds the input data and passes everything the extractor
    # yields to the JSON formatter, which writes to the standard output.
    class ChupaText
      class << self
        # Builds a command object and runs it.
        #
        # @param [Array<String>] arguments The command line arguments.
        # @return [Boolean] The return value of {#run}.
        def run(*arguments)
          new.run(*arguments)
        end
      end

      def initialize
        # `nil` means that no FILE_OR_URI argument was given.
        @input = nil
        @configuration = Configuration.default
      end

      # Runs the extraction for the given command line.
      #
      # @param [Array<String>] arguments The command line arguments.
      # @return [Boolean] `false` if the arguments can't be parsed,
      #   `true` once the formatter has been finished.
      def run(*arguments)
        return false unless parse_arguments(arguments)

        Decomposers.load
        extractor = create_extractor
        data = create_data
        formatter = create_formatter
        formatter.format_start(data)
        extractor.extract(data) do |extracted|
          formatter.format_extracted(extracted)
        end
        formatter.format_finish(data)
        true
      end

      private
      # Loads the configuration file at `path` into the configuration
      # used by this command.
      def load_configuration(path)
        ConfigurationLoader.new(@configuration).load(path)
      end

      # Parses the options and remembers the optional input argument.
      #
      # The parse error message, or the help message when more than one
      # input is given, is printed with `puts`.
      #
      # @return [Boolean] `true` if the command line is acceptable,
      #   `false` otherwise.
      def parse_arguments(arguments)
        parser = create_option_parser
        begin
          rest = parser.parse!(arguments)
        rescue OptionParser::ParseError => error
          # Use the rescued exception itself rather than the `$!` global.
          puts(error.message)
          return false
        end
        if rest.size > 1
          puts(parser.help)
          return false
        end
        @input = rest.first
        true
      end

      # @return [OptionParser] The parser that knows the options of
      #   this command.
      def create_option_parser
        parser = OptionParser.new
        parser.banner += " [FILE_OR_URI]"
        parser.version = VERSION
        parser.on("--configuration=FILE",
                  "Read configuration from FILE.") do |path|
          load_configuration(path)
        end
        parser
      end

      # @return [Extractor] A new extractor with the current
      #   configuration applied.
      def create_extractor
        extractor = Extractor.new
        extractor.apply_configuration(@configuration)
        extractor
      end

      # Builds the data to be extracted: the standard input is used
      # when no input argument was given.
      def create_data
        if @input.nil?
          VirtualFileData.new(nil, $stdin)
        else
          InputData.new(@input)
        end
      end

      # @return [Formatters::JSON] The formatter writing to the
      #   standard output.
      def create_formatter
        Formatters::JSON.new($stdout)
      end
    end
  end
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "pathname"
|
18
|
+
|
19
|
+
module ChupaText
  # Loads configuration files written in Ruby. The content of a file is
  # evaluated in the context of the loader, so a file can use
  # `decomposer` and `mime_type` to change the configuration.
  class ConfigurationLoader
    # @return [DecomposerLoader] The object exposed to configuration
    #   files to set the decomposer names and options.
    attr_reader :decomposer

    # @return [MIMETypeLoader] The object exposed to configuration
    #   files to register MIME types.
    attr_reader :mime_type

    # @param configuration The configuration to be changed. It must
    #   respond to `decomposer` and `mime_type_registry`.
    def initialize(configuration)
      @configuration = configuration
      @decomposer = DecomposerLoader.new(@configuration.decomposer)
      @mime_type = MIMETypeLoader.new(@configuration.mime_type_registry)
      # Relative paths that don't exist are also searched in the
      # "data" directory two levels above this file.
      data_dir = File.join(File.dirname(__FILE__), "..", "..", "data")
      @load_paths = [File.expand_path(data_dir)]
    end

    # Evaluates the configuration file.
    #
    # NOTE(review): the file content is run with `instance_eval`, that
    # is, as arbitrary Ruby code. Only load files from trusted sources.
    #
    # @param [String] path The path of the configuration file. See
    #   {#resolve_path} for how it is resolved.
    # @return The value of the last expression in the file.
    def load(path)
      path = resolve_path(path)
      instance_eval(File.read(path), path, 1)
    end

    private
    # Resolves `path` against the load paths.
    #
    # @return [String] `path` as is if it exists or is absolute, the
    #   first existing candidate in the load paths otherwise. `path` is
    #   returned unchanged when nothing is found.
    def resolve_path(path)
      return path if File.exist?(path)
      return path if Pathname(path).absolute?

      candidates = @load_paths.map do |load_path|
        File.join(load_path, path)
      end
      candidates.find {|candidate| File.exist?(candidate)} || path
    end

    # Wraps a decomposer configuration for use in configuration files:
    # `names` selects decomposers and any other name reads or writes
    # the options stored under that name.
    class DecomposerLoader
      # @param configuration The wrapped object. It must respond to
      #   `names`, `names=` and `options`.
      def initialize(configuration)
        @configuration = configuration
      end

      def names
        @configuration.names
      end

      def names=(names)
        @configuration.names = names
      end

      # @return The options stored under `name`.
      def [](name)
        @configuration.options[name]
      end

      # Stores `options` under `name`.
      def []=(name, options)
        @configuration.options[name] = options
      end

      # Maps `loader.NAME` to `loader["NAME"]` and `loader.NAME = value`
      # to `loader["NAME"] = value`. Calls with a block or with any
      # other argument shape are delegated to the default
      # implementation, which raises `NoMethodError`.
      def method_missing(name, *arguments)
        return super if block_given?

        option_name = name.to_s
        if option_name.end_with?("=") && arguments.size == 1
          self[option_name.chomp("=")] = arguments.first
        elsif arguments.empty?
          self[option_name]
        else
          super
        end
      end

      # Keeps `respond_to?` consistent with {#method_missing}: every
      # name is answered as an option reader or writer.
      def respond_to_missing?(name, include_private=false)
        true
      end
    end

    # Wraps a MIME type registry for use in configuration files.
    class MIMETypeLoader
      # @param registry The wrapped registry. It must respond to
      #   `register`.
      def initialize(registry)
        @registry = registry
      end

      # Registers `mime_type` for `extension` in the registry.
      def []=(extension, mime_type)
        @registry.register(extension, mime_type)
      end
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
module ChupaText
  # Holds the settings used for extraction: the decomposer
  # configuration and the MIME type registry.
  class Configuration
    # @return [Configuration] The shared default configuration. It is
    #   created on first use by loading "chupa-text.conf".
    def self.default
      @default ||= create_default
    end

    # Builds a new configuration and loads "chupa-text.conf" into it.
    def self.create_default
      new.tap do |configuration|
        ConfigurationLoader.new(configuration).load("chupa-text.conf")
      end
    end
    private_class_method :create_default

    # @return [DecomposerConfiguration] The decomposer related settings.
    attr_reader :decomposer

    # The registry assigned from `MIMEType.registry` on creation.
    attr_accessor :mime_type_registry

    def initialize
      @decomposer = DecomposerConfiguration.new
      @mime_type_registry = MIMEType.registry
    end

    # The decomposer names and the options kept per name.
    class DecomposerConfiguration
      attr_accessor :names, :options

      def initialize
        # The initial names consist of the single entry "*".
        @names = ["*"]
        @options = {}
      end
    end
  end
end
|
@@ -0,0 +1,149 @@
|
|
1
|
+
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "open-uri"
require "pathname"
require "stringio"
require "uri"
|
19
|
+
|
20
|
+
module ChupaText
  # Represents a piece of content together with its URI, path, MIME
  # type, attributes and the data it was taken from.
  class Data
    # @return [URI, nil] The URI of the data if the data is for remote
    #   or local file, `nil` if the data isn't associated with any
    #   URIs.
    attr_reader :uri

    # @return [String, nil] The content of the data, `nil` if the data
    #   doesn't have any content.
    attr_accessor :body

    # @return [Integer, nil] The byte size of the data, `nil` if the data
    #   doesn't have any content.
    attr_accessor :size

    # @return [String, nil] The path associated with the content of
    #   the data, `nil` if the data isn't associated with any file.
    #
    # The path may not be related with the original content. For
    # example, `"/tmp/XXX.txt"` may be returned for the data of
    # `"http://example.com/XXX.txt"`.
    #
    # This value is useful to use an external command to extract
    # text and meta-data.
    attr_accessor :path

    # @return [Hash] The attributes of the data. They are also
    #   available via {#[]} and {#[]=}.
    attr_accessor :attributes

    # @return [Data, nil] The source of the data. For example, text
    #   data (`hello.txt`) in archive data (`hello.tar`) have the
    #   archive data in {#source}.
    attr_accessor :source

    def initialize
      @uri = nil
      @body = nil
      @size = nil
      @path = nil
      @mime_type = nil
      @attributes = {}
      @source = nil
    end

    # Gives the copy its own attributes hash so that changing the
    # attributes of the copy doesn't change the original.
    def initialize_copy(object)
      super
      @attributes = @attributes.dup
      self
    end

    # @param [String, Pathname, URI, nil] uri The URI for the data.
    #   Strings and pathnames are parsed with `URI.parse`, other
    #   values are stored as is. If `uri` is `nil`, it means that the
    #   data isn't associated with any URIs.
    def uri=(uri)
      case uri
      when String, Pathname
        uri = URI.parse(uri.to_s)
      end
      @uri = uri
    end

    # Yields a `StringIO` that reads from {#body}.
    def open
      yield(StringIO.new(body))
    end

    # @return The attribute value stored under `name`.
    def [](name)
      @attributes[name]
    end

    # Stores `value` as the attribute `name`.
    def []=(name, value)
      @attributes[name] = value
    end

    # @return [String] The MIME type of the data. If MIME type
    #   isn't set, guesses MIME type from path and body.
    # @return [nil] If MIME type isn't set and it can't guess MIME type
    #   from path and body.
    def mime_type
      @mime_type || guess_mime_type
    end

    # @param [String, nil] type The MIME type of the data. You can
    #   unset MIME type by `nil`. If you unset MIME type, MIME type
    #   is guessed from path and body of the data.
    def mime_type=(type)
      @mime_type = type
    end

    # @return [String, nil] Normalized extension as String if {#uri}
    #   is not `nil` and has a path, `nil` otherwise. The normalized
    #   extension uses lower case like `pdf` not `PDF`.
    def extension
      return nil if @uri.nil?

      uri_path = @uri.path
      # A URI object without a path has no extension to report.
      return nil if uri_path.nil?

      File.extname(uri_path).downcase.sub(/\A\./, "")
    end

    # @return [Bool] true if MIME type is "text/XXX", false
    #   otherwise.
    def text?
      (mime_type || "").start_with?("text/")
    end

    private
    # Tries the extension of the URI first, then the body.
    def guess_mime_type
      guess_mime_type_from_uri || guess_mime_type_from_body
    end

    # Looks the extension up in the MIME type registry.
    def guess_mime_type_from_uri
      MIMEType.registry.find(extension)
    end

    # @return [String, nil] "text/plain" if the body is a valid UTF-8
    #   byte sequence, `nil` if it isn't or if there is no body.
    def guess_mime_type_from_body
      mime_type = nil
      change_encoding(body, "UTF-8") do |utf8_body|
        mime_type = "text/plain" if utf8_body.valid_encoding?
      end
      mime_type
    end

    # Yields `string` interpreted as `encoding`.
    #
    # The encoding is changed on a duplicate, so the given string is
    # never modified. This also makes it work for frozen strings.
    #
    # @return The value of the block, `nil` if `string` is `nil`.
    def change_encoding(string, encoding)
      return if string.nil?

      yield(string.dup.force_encoding(encoding))
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
module ChupaText
  # Keeps decomposer classes by name. Enumerating the registry yields
  # the registered name and class pairs.
  class DecomposerRegistry
    include Enumerable

    def initialize
      @decomposer_classes = Hash.new
    end

    # Associates `decomposer_class` with `name`. A class registered
    # earlier under the same name is replaced.
    #
    # @return The registered class.
    def register(name, decomposer_class)
      @decomposer_classes.store(name, decomposer_class)
    end

    # @return The class registered under `name`, `nil` if there is
    #   no such class.
    def find(name)
      @decomposer_classes.fetch(name, nil)
    end

    # Passes the block on to the underlying hash.
    def each(&visitor)
      @decomposer_classes.each(&visitor)
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
module ChupaText
  # Base class of decomposers. Subclasses must implement {#target?}
  # and {#decompose}; the implementations here only raise.
  class Decomposer
    # @return [DecomposerRegistry] The registry. It is created on the
    #   first call and kept in a class variable.
    def self.registry
      @@registry ||= DecomposerRegistry.new
    end

    # @param options The options for the decomposer. They are kept in
    #   `@options`.
    def initialize(options)
      @options = options
    end

    # @raise [NotImplementedError] Always, unless overridden.
    def target?(data)
      message = "must implement #{self.class}##{__method__}"
      raise(NotImplementedError, message)
    end

    # @raise [NotImplementedError] Always, unless overridden.
    def decompose(data)
      message = "must implement #{self.class}##{__method__}"
      raise(NotImplementedError, message)
    end
  end
end
|