yomu 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +10 -10
- data/lib/yomu.rb +21 -8
- data/lib/yomu/version.rb +1 -1
- data/test/samples/sample filename with spaces.pages +0 -0
- data/test/yomu_test.rb +43 -5
- metadata +5 -2
data/README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# Yomu 読む
|
2
|
-
[Yomu](http://github.com/
|
2
|
+
[Yomu](http://erol.github.com/yomu) is a library for extracting text and metadata from files and documents using the [Apache TIKA](http://tika.apache.org/) content analysis toolkit.
|
3
3
|
|
4
4
|
Here are some of the formats supported:
|
5
5
|
|
@@ -10,7 +10,7 @@ Here are some of the formats supported:
|
|
10
10
|
- Rich Text Format (.rtf)
|
11
11
|
- Portable Document Format (.pdf)
|
12
12
|
|
13
|
-
For the complete list of supported formats, please visit the Apache
|
13
|
+
For the complete list of supported formats, please visit the Apache TIKA
|
14
14
|
[Supported Document Formats](http://tika.apache.org/0.9/formats.html) page.
|
15
15
|
|
16
16
|
## Installation and Dependencies
|
@@ -27,7 +27,7 @@ Or install it yourself as:
|
|
27
27
|
|
28
28
|
$ gem install yomu
|
29
29
|
|
30
|
-
Yomu packages the Apache
|
30
|
+
**Yomu packages the Apache TIKA application jar and requires a working JRE for it to work.**
|
31
31
|
|
32
32
|
## Usage
|
33
33
|
|
@@ -40,21 +40,21 @@ You can extract text by calling `Yomu.read` directly:
|
|
40
40
|
data = File.read 'sample.pages'
|
41
41
|
text = Yomu.read :text, data
|
42
42
|
|
43
|
-
|
43
|
+
### Reading text from a given filename
|
44
44
|
|
45
45
|
You can also make a new instance of Yomu and pass a filename.
|
46
46
|
|
47
47
|
yomu = Yomu.new 'sample.pages'
|
48
48
|
text = yomu.text
|
49
49
|
|
50
|
-
|
50
|
+
### Reading text from a given URL
|
51
51
|
|
52
52
|
This is useful for reading remote files, like documents hosted on Amazon S3.
|
53
53
|
|
54
54
|
yomu = Yomu.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
|
55
55
|
text = yomu.text
|
56
56
|
|
57
|
-
|
57
|
+
### Reading text from a stream
|
58
58
|
|
59
59
|
Yomu can also read from a stream or any object that responds to `read`, including Ruby on Rails' and Sinatra's file uploads:
|
60
60
|
|
@@ -66,8 +66,8 @@ Yomu can also read from a stream or any object that responds to `read`, includin
|
|
66
66
|
## Contributing
|
67
67
|
|
68
68
|
1. Fork it
|
69
|
-
2. Create your feature branch (`git checkout -b my-new-feature`)
|
70
|
-
3. Create tests and make them pass (`rake test`)
|
71
|
-
4. Commit your changes (`git commit -am 'Added some feature'`)
|
72
|
-
5. Push to the branch (`git push origin my-new-feature`)
|
69
|
+
2. Create your feature branch ( `git checkout -b my-new-feature` )
|
70
|
+
3. Create tests and make them pass ( `rake test` )
|
71
|
+
4. Commit your changes ( `git commit -am 'Added some feature'` )
|
72
|
+
5. Push to the branch ( `git push origin my-new-feature` )
|
73
73
|
6. Create a new Pull Request
|
data/lib/yomu.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require
|
1
|
+
require 'yomu/version'
|
2
2
|
|
3
3
|
require 'net/http'
|
4
4
|
require 'yaml'
|
@@ -46,11 +46,12 @@ class Yomu
|
|
46
46
|
|
47
47
|
def initialize(input)
|
48
48
|
if input.is_a? String
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
else
|
49
|
+
if input =~ URI::regexp
|
50
|
+
@uri = URI.parse input
|
51
|
+
elsif File.exists? input
|
53
52
|
@path = input
|
53
|
+
else
|
54
|
+
raise Errno::ENOENT.new "missing file or invalid URI - #{input}"
|
54
55
|
end
|
55
56
|
elsif input.respond_to? :read
|
56
57
|
@stream = input
|
@@ -81,16 +82,28 @@ class Yomu
|
|
81
82
|
@metadata = Yomu.read :metadata, data
|
82
83
|
end
|
83
84
|
|
85
|
+
def path?
|
86
|
+
defined? @path
|
87
|
+
end
|
88
|
+
|
89
|
+
def uri?
|
90
|
+
defined? @uri
|
91
|
+
end
|
92
|
+
|
93
|
+
def stream?
|
94
|
+
defined? @stream
|
95
|
+
end
|
96
|
+
|
84
97
|
protected
|
85
98
|
|
86
99
|
def data
|
87
100
|
return @data if defined? @data
|
88
101
|
|
89
|
-
if
|
102
|
+
if path?
|
90
103
|
@data = File.read @path
|
91
|
-
elsif
|
104
|
+
elsif uri?
|
92
105
|
@data = Net::HTTP.get @uri
|
93
|
-
elsif
|
106
|
+
elsif stream?
|
94
107
|
@data = @stream.read
|
95
108
|
end
|
96
109
|
|
data/lib/yomu/version.rb
CHANGED
Binary file
|
data/test/yomu_test.rb
CHANGED
@@ -23,26 +23,64 @@ class YomuTest < MiniTest::Unit::TestCase
|
|
23
23
|
end
|
24
24
|
end
|
25
25
|
|
26
|
-
def
|
26
|
+
def test_yomu_can_be_initialized_with_a_root_path
|
27
27
|
assert_silent do
|
28
|
-
Yomu.new '
|
28
|
+
yomu = Yomu.new File.join(File.dirname(__FILE__), 'samples/sample.pages')
|
29
|
+
|
30
|
+
assert_block { yomu.path? }
|
31
|
+
assert_block { !yomu.uri? }
|
32
|
+
assert_block { !yomu.stream? }
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
def test_yomu_can_be_initialized_with_a_relative_path
|
37
|
+
assert_silent do
|
38
|
+
yomu = Yomu.new 'test/samples/sample.pages'
|
39
|
+
|
40
|
+
assert_block { yomu.path? }
|
41
|
+
assert_block { !yomu.uri? }
|
42
|
+
assert_block { !yomu.stream? }
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
def test_yomu_can_be_initialized_with_a_path_with_spaces
|
47
|
+
assert_silent do
|
48
|
+
yomu = Yomu.new 'test/samples/sample filename with spaces.pages'
|
49
|
+
|
50
|
+
assert_block { yomu.path? }
|
51
|
+
assert_block { !yomu.uri? }
|
52
|
+
assert_block { !yomu.stream? }
|
29
53
|
end
|
30
54
|
end
|
31
55
|
|
32
|
-
def
|
56
|
+
def test_yomu_can_be_initialized_with_a_uri
|
33
57
|
assert_silent do
|
34
|
-
Yomu.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
|
58
|
+
yomu = Yomu.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
|
59
|
+
|
60
|
+
assert_block { yomu.uri? }
|
61
|
+
assert_block { !yomu.path? }
|
62
|
+
assert_block { !yomu.stream? }
|
35
63
|
end
|
36
64
|
end
|
37
65
|
|
38
66
|
def test_yomu_can_be_initialized_with_a_stream_or_object_that_can_be_read
|
39
67
|
assert_silent do
|
40
68
|
File.open 'test/samples/sample.pages', 'r' do |file|
|
41
|
-
Yomu.new file
|
69
|
+
yomu = Yomu.new file
|
70
|
+
|
71
|
+
assert_block { yomu.stream? }
|
72
|
+
assert_block { !yomu.path? }
|
73
|
+
assert_block { !yomu.uri? }
|
42
74
|
end
|
43
75
|
end
|
44
76
|
end
|
45
77
|
|
78
|
+
def test_yomu_cannot_be_initialized_with_a_path_to_a_missing_file
|
79
|
+
assert_raises Errno::ENOENT do
|
80
|
+
Yomu.new 'test/sample/missing.pages'
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
46
84
|
def test_yomu_cannot_be_initialized_with_other_objects
|
47
85
|
[nil, 1, 1.1].each do |object|
|
48
86
|
assert_raises TypeError do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yomu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-04-11 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: Yomu is a library for extracting text and metadata using the Apache TIKA
|
15
15
|
content analysis toolkit.
|
@@ -28,6 +28,7 @@ files:
|
|
28
28
|
- jar/tika-app-1.1.jar
|
29
29
|
- lib/yomu.rb
|
30
30
|
- lib/yomu/version.rb
|
31
|
+
- test/samples/sample filename with spaces.pages
|
31
32
|
- test/samples/sample.pages
|
32
33
|
- test/test_helper.rb
|
33
34
|
- test/yomu_test.rb
|
@@ -58,6 +59,8 @@ specification_version: 3
|
|
58
59
|
summary: Yomu is a library for extracting text and metadata using the Apache TIKA
|
59
60
|
content analysis toolkit.
|
60
61
|
test_files:
|
62
|
+
- test/samples/sample filename with spaces.pages
|
61
63
|
- test/samples/sample.pages
|
62
64
|
- test/test_helper.rb
|
63
65
|
- test/yomu_test.rb
|
66
|
+
has_rdoc:
|