chupa-text 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +5 -0
  3. data/Gemfile +21 -0
  4. data/LICENSE.txt +502 -0
  5. data/README.md +91 -0
  6. data/Rakefile +46 -0
  7. data/bin/chupa-text +21 -0
  8. data/bin/chupa-text-generate-decomposer +21 -0
  9. data/chupa-text.gemspec +58 -0
  10. data/data/chupa-text.conf +5 -0
  11. data/data/mime-types.conf +19 -0
  12. data/doc/text/command-line.md +136 -0
  13. data/doc/text/decomposer.md +343 -0
  14. data/doc/text/library.md +72 -0
  15. data/doc/text/news.md +5 -0
  16. data/lib/chupa-text.rb +37 -0
  17. data/lib/chupa-text/command.rb +18 -0
  18. data/lib/chupa-text/command/chupa-text-generate-decomposer.rb +324 -0
  19. data/lib/chupa-text/command/chupa-text.rb +102 -0
  20. data/lib/chupa-text/configuration-loader.rb +95 -0
  21. data/lib/chupa-text/configuration.rb +49 -0
  22. data/lib/chupa-text/data.rb +149 -0
  23. data/lib/chupa-text/decomposer-registry.rb +37 -0
  24. data/lib/chupa-text/decomposer.rb +37 -0
  25. data/lib/chupa-text/decomposers.rb +59 -0
  26. data/lib/chupa-text/decomposers/csv.rb +44 -0
  27. data/lib/chupa-text/decomposers/gzip.rb +51 -0
  28. data/lib/chupa-text/decomposers/tar.rb +42 -0
  29. data/lib/chupa-text/decomposers/xml.rb +55 -0
  30. data/lib/chupa-text/extractor.rb +91 -0
  31. data/lib/chupa-text/file-content.rb +35 -0
  32. data/lib/chupa-text/formatters.rb +17 -0
  33. data/lib/chupa-text/formatters/json.rb +60 -0
  34. data/lib/chupa-text/input-data.rb +58 -0
  35. data/lib/chupa-text/mime-type-registry.rb +41 -0
  36. data/lib/chupa-text/mime-type.rb +36 -0
  37. data/lib/chupa-text/text-data.rb +26 -0
  38. data/lib/chupa-text/version.rb +19 -0
  39. data/lib/chupa-text/virtual-content.rb +91 -0
  40. data/lib/chupa-text/virtual-file-data.rb +46 -0
  41. data/test/command/test-chupa-text.rb +178 -0
  42. data/test/decomposers/test-csv.rb +48 -0
  43. data/test/decomposers/test-gzip.rb +113 -0
  44. data/test/decomposers/test-tar.rb +78 -0
  45. data/test/decomposers/test-xml.rb +58 -0
  46. data/test/fixture/command/chupa-text/hello.txt +1 -0
  47. data/test/fixture/command/chupa-text/hello.txt.gz +0 -0
  48. data/test/fixture/command/chupa-text/no-decomposer.conf +3 -0
  49. data/test/fixture/extractor/hello.txt +1 -0
  50. data/test/fixture/gzip/hello.tar.gz +0 -0
  51. data/test/fixture/gzip/hello.tgz +0 -0
  52. data/test/fixture/gzip/hello.txt.gz +0 -0
  53. data/test/fixture/tar/directory.tar +0 -0
  54. data/test/fixture/tar/top-level.tar +0 -0
  55. data/test/helper.rb +25 -0
  56. data/test/run-test.rb +35 -0
  57. data/test/test-configuration-loader.rb +54 -0
  58. data/test/test-data.rb +85 -0
  59. data/test/test-decomposer-registry.rb +30 -0
  60. data/test/test-decomposer.rb +41 -0
  61. data/test/test-decomposers.rb +59 -0
  62. data/test/test-extractor.rb +125 -0
  63. data/test/test-file-content.rb +51 -0
  64. data/test/test-mime-type-registry.rb +48 -0
  65. data/test/test-text-data.rb +36 -0
  66. data/test/test-virtual-content.rb +103 -0
  67. metadata +183 -0
@@ -0,0 +1,58 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "uri"
18
+ require "open-uri"
19
+
20
module ChupaText
  # Input data identified by a URI. The content is either read from a
  # local path or downloaded, depending on the class of the URI.
  #
  # NOTE(review): `self.uri=` and `self.mime_type=` come from `Data`,
  # which is not visible here; this class relies on `self.uri=` leaving
  # a URI object in `@uri`.
  class InputData < Data
    # @param uri The location of the input. It is handed to `self.uri=`
    #   as is.
    def initialize(uri)
      super()
      self.uri = uri
      @content =
        if URI::Generic == @uri.class
          # Exactly URI::Generic (not a subclass such as URI::HTTP):
          # use the path part as a local file path.
          FileContent.new(@uri.path)
        else
          download
        end
    end

    # @return The body of the underlying content.
    def body
      @content.body
    end

    # @return The size of the underlying content.
    def size
      @content.size
    end

    # @return The path of the underlying content.
    def path
      @content.path
    end

    # Opens the underlying content, forwarding the given block to it.
    def open(&block)
      @content.open(&block)
    end

    private
    # Opens `@uri` in binary mode, records the response's content type
    # (without parameters) as the MIME type and wraps the response in a
    # `VirtualContent`.
    #
    # @return [VirtualContent] The downloaded content.
    def download
      virtual_path = @uri.path
      # A path that ends with "/" has no file name; use "index.html".
      virtual_path += "index.html" if virtual_path.end_with?("/")
      @uri.open("rb") do |response|
        media_type, = response.content_type.split(/;/)
        self.mime_type = media_type
        VirtualContent.new(response, virtual_path)
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
module ChupaText
  # Maps file extensions to MIME types. Extensions are compared
  # case-insensitively and with one leading "." ignored.
  class MIMETypeRegistry
    def initialize
      @from_extension_map = {}
    end

    # Associates a MIME type with an extension.
    #
    # @param extension The extension, with or without a leading ".".
    # @param mime_type The MIME type to associate with the extension.
    # @return The given MIME type.
    def register(extension, mime_type)
      key = normalize_extension(extension)
      @from_extension_map[key] = mime_type
    end

    # @param extension The extension, with or without a leading ".".
    # @return The MIME type registered for the extension, or `nil`
    #   when nothing is registered for it.
    def find(extension)
      key = normalize_extension(extension)
      @from_extension_map[key]
    end

    # Removes all registered associations.
    def clear
      @from_extension_map.clear
    end

    private
    # Converts an extension to the form used as a key: a lower-case
    # string without one leading ".". `nil` is kept as `nil`.
    def normalize_extension(extension)
      if extension.nil?
        nil
      else
        lower_cased = extension.to_s.downcase
        # The pattern is anchored at the start, so at most one "." goes.
        lower_cased.sub(/\A\./, "")
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
module ChupaText
  # Holds the MIME type registry shared by the whole process.
  module MIMEType
    # @return [MIMETypeRegistry] The MIME type registry for this
    #   process. It is created on the first call when none is set.
    def self.registry
      @@registry ||= MIMETypeRegistry.new
    end

    # Normally, this method should not be used. It is just for test.
    #
    # @param [MIMETypeRegistry, nil] registry
    #   The new MIME type registry for this process.
    #   If you specify `nil`, reset the registry.
    def self.registry=(registry)
      @@registry = registry
    end
  end
end
@@ -0,0 +1,26 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
module ChupaText
  # Data built directly from a text string.
  class TextData < Data
    # Sets the MIME type to "text/plain", the body to the given text and
    # the size to the number of bytes (not characters) in the text.
    #
    # NOTE(review): the three setters come from `Data`, which is not
    # visible here.
    #
    # @param [String] text The text to hold; it must respond to
    #   `bytesize`.
    def initialize(text)
      super()
      self.mime_type = "text/plain"
      self.body = text
      self.size = text.bytesize
    end
  end
end
@@ -0,0 +1,19 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
module ChupaText
  # The version of this library, as a string.
  VERSION = "1.0.0"
end
@@ -0,0 +1,91 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "stringio"
18
+ require "tempfile"
19
+
20
module ChupaText
  # Content read from an input stream that has no file of its own.
  #
  # Input smaller than {BUFFER_SIZE} bytes is kept in memory; anything
  # else is spooled to a temporary file while it is read. A temporary
  # file is also created on demand when {#path} is called for in-memory
  # content.
  class VirtualContent
    KILO_BYTE = 1024
    BUFFER_SIZE = 64 * KILO_BYTE
    # Base name of the temporary file when the original path gives none.
    DEFAULT_BASE_NAME = "chupa-text-virtual-content"

    # @return [Integer] The number of bytes read from the input.
    attr_reader :size

    # @param input An object that responds to `read(length)` and returns
    #   `nil` at the end of the data, like `IO`.
    # @param [String, nil] original_path The path the content originally
    #   had. Only its base name is used, to name the temporary file so
    #   that the extension is kept.
    def initialize(input, original_path=nil)
      @file = nil
      @base_name = compute_base_name(original_path)
      chunk = input.read(BUFFER_SIZE) || ""
      if chunk.bytesize != BUFFER_SIZE
        # A short first read means the whole input has been read.
        @path = nil
        @body = chunk
        @size = @body.bytesize
      else
        @body = nil
        @size = chunk.bytesize
        setup_file do |file|
          file.write(chunk)
          while (chunk = input.read(BUFFER_SIZE))
            @size += chunk.bytesize
            file.write(chunk)
          end
        end
      end
    end

    # Yields a readable stream positioned at the start of the content.
    # In-memory content is wrapped in a `StringIO`; otherwise the
    # temporary file is opened in binary mode.
    def open(&block)
      if @body
        yield(StringIO.new(@body))
      else
        File.open(path, "rb", &block)
      end
    end

    # @return [String] The whole content. It is read from the temporary
    #   file on the first call and kept in memory afterwards.
    def body
      @body ||= open {|file| file.read}
    end

    # @return [String] The path of a file that has the content. The
    #   file is created on the first call for in-memory content.
    def path
      ensure_setup_file do |file|
        file.write(@body)
      end
      @path
    end

    private
    # Builds the base name argument for `Tempfile.new` from the original
    # path: `[prefix, suffix]` when the base name has an extension, the
    # base name itself when it has none, and {DEFAULT_BASE_NAME} when
    # there is no usable base name at all.
    def compute_base_name(original_path)
      return DEFAULT_BASE_NAME unless original_path

      prefix, suffix = File.basename(original_path).split(/(\.[^.]+\z)/)
      # An empty base name (for example the path of "http://example.com")
      # splits into nothing. `Tempfile.new` rejects `nil`, so use the
      # default name instead.
      return DEFAULT_BASE_NAME if prefix.nil?

      if suffix
        [prefix, suffix]
      else
        prefix
      end
    end

    # Creates the temporary file unless it already exists.
    def ensure_setup_file(&block)
      setup_file(&block) unless @file
    end

    # Creates the temporary file, yields it for writing and closes it.
    # The `Tempfile` object is kept in `@file` so that the file stays on
    # disk for as long as this content is alive.
    def setup_file
      @file = Tempfile.new(@base_name)
      # The content is arbitrary bytes: avoid newline and encoding
      # conversion on write.
      @file.binmode
      @path = @file.path
      begin
        yield(@file)
      ensure
        # Flush and release the descriptor even if writing failed.
        @file.close
      end
    end
  end
end
@@ -0,0 +1,46 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
module ChupaText
  # Data whose content is read from an input stream instead of a real
  # file. The content is held by a `VirtualContent`.
  #
  # NOTE(review): `self.uri=` comes from `Data`, which is not visible
  # here; this class relies on it setting `@uri`.
  class VirtualFileData < Data
    # @param uri The URI of the data. It is handed to `self.uri=` as is.
    # @param input The stream to read the content from.
    def initialize(uri, input)
      super()
      self.uri = uri
      # The path is only a naming hint for the content; none without URI.
      original_path = @uri ? @uri.path : nil
      @content = VirtualContent.new(input, original_path)
    end

    # @return The body of the underlying content.
    def body
      @content.body
    end

    # @return The size of the underlying content.
    def size
      @content.size
    end

    # @return The path of the underlying content.
    def path
      @content.path
    end

    # Opens the underlying content, forwarding the given block to it.
    def open(&block)
      @content.open(&block)
    end
  end
end
@@ -0,0 +1,178 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "socket"
18
+
19
# Tests for the `chupa-text` command. Each case runs the command in this
# process with $stdin and $stdout replaced by in-memory streams and
# compares the command's result and its JSON output with expected values.
class TestCommandChupaText < Test::Unit::TestCase
  include Helper

  def setup
    setup_io
  end

  # Creates the in-memory streams that stand in for standard input and
  # standard output while the command runs.
  def setup_io
    @stdin = StringIO.new
    @stdout = StringIO.new
  end

  private
  # Runs the block with $stdin and $stdout replaced by the in-memory
  # streams. The original streams are restored even if the block raises.
  #
  # @return The value of the block.
  def wrap_io
    @original_stdin = $stdin
    @original_stdout = $stdout
    $stdin = @stdin
    $stdout = @stdout
    begin
      yield
    ensure
      $stdin = @original_stdin
      $stdout = @original_stdout
    end
  end

  # Runs the command with the given command line arguments.
  #
  # @return [Array] The value returned by the command and the parsed
  #   JSON that the command wrote to standard output.
  def run_command(*arguments)
    succeeded = wrap_io do
      ChupaText::Command::ChupaText.run(*arguments)
    end
    [succeeded, JSON.parse(@stdout.string)]
  end

  # Resolves a fixture under "command/chupa-text" by delegating to the
  # inherited `fixture_path` (presumably provided by Helper).
  def fixture_path(*components)
    super("command", "chupa-text", *components)
  end

  sub_test_case("output") do
    sub_test_case("file") do
      # A plain text file given by path is reported with its path as the
      # URI and a single text entry that has the file content.
      def test_single
        body = "Hello\n"
        path = fixture_path("hello.txt").to_s
        assert_equal([
                       true,
                       {
                         "mime-type" => "text/plain",
                         "uri" => path,
                         "size" => body.bytesize,
                         "texts" => [
                           {
                             "mime-type" => "text/plain",
                             "uri" => path,
                             "size" => body.bytesize,
                             "body" => body,
                           },
                         ],
                       },
                     ],
                     run_command(path))
      end
    end

    sub_test_case("URI") do
      def setup
        super
        setup_www_server
      end

      def teardown
        super
        teardown_www_server
      end

      # Starts a minimal HTTP server on a local port that serves one
      # request with the content of @html as "text/html".
      def setup_www_server
        # Port 0 lets the system pick a free port; the actual port is
        # read back from the server's address.
        @www_server = TCPServer.new("127.0.0.1", 0)
        _, port, host, = @www_server.addr
        @uri = "http://#{host}:#{port}/"
        @www_server_thread = Thread.new do
          client = @www_server.accept
          # Skip the request: read up to the empty line that ends the
          # request header.
          loop do
            line = client.gets
            break if line.chomp.empty?
          end
          client.print("HTTP/1.1 200 OK\r\n")
          client.print("Content-Type: text/html\r\n")
          client.print("\r\n")
          # @html is read here, when the request arrives, so the test
          # can set it after the server has been started.
          client.print(@html)
          # No Content-Length is sent; closing the connection ends the
          # response body.
          client.close
        end
      end

      # Stops the server socket and the thread that serves the request.
      def teardown_www_server
        @www_server.close
        @www_server_thread.kill
      end

      # A document given by an HTTP URI is reported with the MIME type
      # from the response and a single text entry that has the document.
      def test_single
        @html = "<html><body>Hello</body></html>"
        assert_equal([
                       true,
                       {
                         "mime-type" => "text/html",
                         "size" => @html.bytesize,
                         "uri" => @uri,
                         "texts" => [
                           {
                             "mime-type" => "text/html",
                             "size" => @html.bytesize,
                             "uri" => @uri,
                             "body" => @html,
                           },
                         ],
                       },
                     ],
                     run_command(@uri))
      end
    end

    sub_test_case("standard input") do
      # Without arguments the command reads standard input; the output
      # has no "uri" entries.
      def test_single
        body = "Hello\n"
        @stdin << "Hello\n"
        # Move back to the start so that the command reads what was
        # just written.
        @stdin.rewind
        assert_equal([
                       true,
                       {
                         "mime-type" => "text/plain",
                         "size" => body.bytesize,
                         "texts" => [
                           {
                             "mime-type" => "text/plain",
                             "size" => body.bytesize,
                             "body" => body,
                           },
                         ],
                       },
                     ],
                     run_command)
      end
    end
  end

  sub_test_case("configuration") do
    # With the "no-decomposer.conf" configuration, a gzip file is
    # reported as is and no text is extracted from it.
    def test_no_decomposer
      conf = fixture_path("no-decomposer.conf")
      gz = fixture_path("hello.txt.gz")
      assert_equal([
                     true,
                     {
                       "uri" => gz.to_s,
                       "mime-type" => "application/x-gzip",
                       "size" => gz.stat.size,
                       "texts" => [],
                     },
                   ],
                   run_command("--configuration", conf.to_s,
                               gz.to_s))
    end
  end
end