yomu 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/NOTICE.txt +8 -0
- data/README.md +73 -0
- data/Rakefile +10 -0
- data/jar/tika-app-1.1.jar +0 -0
- data/lib/yomu.rb +99 -0
- data/lib/yomu/version.rb +3 -0
- data/test/samples/sample.pages +0 -0
- data/test/test_helper.rb +2 -0
- data/test/yomu_test.rb +93 -0
- data/yomu.gemspec +17 -0
- metadata +63 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Erol Fornoles
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/NOTICE.txt
ADDED
data/README.md
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
# Yomu 読む
|
2
|
+
[Yomu](http://github.com/Erol/yomu) is a library for extracting text and metadata using the [Apache TIKA](http://tika.apache.org/) content analysis toolkit.
|
3
|
+
|
4
|
+
Here are some of the formats supported:
|
5
|
+
|
6
|
+
- Microsoft Office OLE 2 and Office Open XML Formats (.doc, .docx, .xls, .xlsx,
|
7
|
+
.ppt, .pptx)
|
8
|
+
- OpenOffice.org OpenDocument Formats (.odt, .ods, .odp)
|
9
|
+
- Apple iWork Formats (e.g. .pages)
|
10
|
+
- Rich Text Format (.rtf)
|
11
|
+
- Portable Document Format (.pdf)
|
12
|
+
|
13
|
+
For the complete list of supported formats, please visit the Apache Tika
|
14
|
+
[Supported Document Formats](http://tika.apache.org/0.9/formats.html) page.
|
15
|
+
|
16
|
+
## Installation and Dependencies
|
17
|
+
|
18
|
+
Add this line to your application's Gemfile:
|
19
|
+
|
20
|
+
gem 'yomu'
|
21
|
+
|
22
|
+
And then execute:
|
23
|
+
|
24
|
+
$ bundle
|
25
|
+
|
26
|
+
Or install it yourself as:
|
27
|
+
|
28
|
+
$ gem install yomu
|
29
|
+
|
30
|
+
Yomu packages the Apache Tika application jar, so a working Java Runtime Environment (JRE) must be installed and `java` must be available on your PATH.
|
31
|
+
|
32
|
+
## Usage
|
33
|
+
|
34
|
+
If you're not using Bundler, you will need to require Yomu in your application:
|
35
|
+
|
36
|
+
require 'yomu'
|
37
|
+
|
38
|
+
You can extract text by calling `Yomu.read` directly:
|
39
|
+
|
40
|
+
data = File.read 'sample.pages'
|
41
|
+
text = Yomu.read :text, data
|
42
|
+
|
43
|
+
##### Filename
|
44
|
+
|
45
|
+
You can also make a new instance of Yomu and pass a filename.
|
46
|
+
|
47
|
+
yomu = Yomu.new 'sample.pages'
|
48
|
+
text = yomu.text
|
49
|
+
|
50
|
+
##### URL
|
51
|
+
|
52
|
+
This is useful for reading remote files, like documents hosted on Amazon S3.
|
53
|
+
|
54
|
+
yomu = Yomu.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
|
55
|
+
text = yomu.text
|
56
|
+
|
57
|
+
##### Stream
|
58
|
+
|
59
|
+
Yomu can also read from a stream or any object that responds to `read`, including Ruby on Rails' and Sinatra's file uploads:
|
60
|
+
|
61
|
+
post '/:name/:filename' do
|
62
|
+
yomu = Yomu.new params[:data]
|
63
|
+
yomu.text
|
64
|
+
end
|
65
|
+
|
66
|
+
## Contributing
|
67
|
+
|
68
|
+
1. Fork it
|
69
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
70
|
+
3. Create tests and make them pass (`rake test`)
|
71
|
+
4. Commit your changes (`git commit -am 'Added some feature'`)
|
72
|
+
5. Push to the branch (`git push origin my-new-feature`)
|
73
|
+
6. Create a new Pull Request
|
data/Rakefile
ADDED
Binary file
|
data/lib/yomu.rb
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
require "yomu/version"
|
2
|
+
|
3
|
+
require 'net/http'
|
4
|
+
require 'yaml'
|
5
|
+
|
6
|
+
# Extracts text and metadata from documents by piping them through the
# bundled Apache Tika application jar (requires `java` on the PATH).
class Yomu
  # Root directory of the gem (the parent of lib/).
  GEMPATH = File.dirname(File.dirname(__FILE__))
  # Location of the bundled Tika application jar.
  JARPATH = File.join(Yomu::GEMPATH, 'jar', 'tika-app-1.1.jar')

  # Read text or metadata from a data buffer.
  #
  #   data = File.read 'sample.pages'
  #   text = Yomu.read :text, data
  #   metadata = Yomu.read :metadata, data
  #
  # type - :text (passes "-t" to Tika) or :metadata (passes "-m").
  #        Any other value passes no switch at all.
  # data - String holding the raw document bytes; written to the java
  #        process' standard input.
  #
  # Returns the process output as a String, or, for :metadata, that
  # output parsed as YAML.

  def self.read(type, data)
    switch = case type
    when :text
      '-t'
    when :metadata
      '-m'
    end

    # Use the argv form of popen so no shell is involved: the jar path may
    # contain spaces or shell metacharacters depending on where the gem is
    # installed. `compact` drops the switch when the type was not recognised.
    command = ['java', '-Djava.awt.headless=true', '-jar', Yomu::JARPATH, switch].compact

    result = IO.popen command, 'r+' do |io|
      io.write data
      # Close our end of stdin so the child sees EOF and starts emitting output.
      io.close_write
      io.read
    end

    # NOTE(review): the metadata text is derived from the (possibly untrusted)
    # document. YAML.load is kept for backward compatibility; consider
    # YAML.safe_load if documents come from untrusted sources.
    type == :metadata ? YAML.load(result) : result
  end

  # Create a new instance of Yomu.
  #
  # Using a file path:
  #
  #   Yomu.new 'sample.pages'
  #
  # Using a URL:
  #
  #   Yomu.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
  #
  # Using a stream or object which responds to +read+
  #
  #   Yomu.new File.open('sample.pages')
  #
  # A String is treated as a URL only when it parses as a URI with both a
  # scheme and a host; every other String is treated as a file path.
  #
  # Raises TypeError when input is neither a String nor responds to +read+.

  def initialize(input)
    if input.is_a? String
      # Ordinary file paths (e.g. ones containing spaces or backslashes) are
      # not valid URIs; URI.parse raises for them, so fall back to a path.
      uri = begin
        URI.parse input
      rescue URI::InvalidURIError
        nil
      end

      if uri && uri.scheme && uri.host
        @uri = uri
      else
        @path = input
      end
    elsif input.respond_to? :read
      @stream = input
    else
      raise TypeError, "can't read from #{input.class.name}"
    end
  end

  # Returns the text contents of a Yomu object.
  #
  #   yomu = Yomu.new 'sample.pages'
  #   yomu.text
  #
  # The result is memoized after the first call.

  def text
    return @text if defined? @text

    @text = Yomu.read :text, data
  end

  # Returns the metadata hash of a Yomu object.
  #
  #   yomu = Yomu.new 'sample.pages'
  #   yomu.metadata['Content-Type']
  #
  # The result is memoized after the first call.

  def metadata
    return @metadata if defined? @metadata

    @metadata = Yomu.read :metadata, data
  end

  protected

  # Loads (once) and returns the raw document bytes from whichever source
  # the instance was initialized with: file path, URL or stream.

  def data
    return @data if defined? @data

    if defined? @path
      # Documents are binary; read raw bytes rather than in text mode.
      @data = File.binread @path
    elsif defined? @uri
      @data = Net::HTTP.get @uri
    elsif defined? @stream
      @data = @stream.read
    end

    @data
  end
end
|
data/lib/yomu/version.rb
ADDED
Binary file
|
data/test/test_helper.rb
ADDED
data/test/yomu_test.rb
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
require_relative 'test_helper.rb'
|
2
|
+
|
3
|
+
require 'yomu.rb'
|
4
|
+
|
5
|
+
# Exercises Yomu against the bundled sample document, a remote sample
# document and the three supported input kinds (path, URL, stream).
class YomuTest < MiniTest::Unit::TestCase
  # Yomu.read returns the document text for a raw data buffer.
  def test_yomu_can_read_text
    buffer = File.read('test/samples/sample.pages')
    extracted = Yomu.read(:text, buffer)

    assert_includes extracted, 'The quick brown fox jumped over the lazy cat.'
  end

  # Yomu.read returns a metadata hash for a raw data buffer.
  def test_yomu_can_read_metadata
    buffer = File.read('test/samples/sample.pages')
    info = Yomu.read(:metadata, buffer)

    assert_equal 'application/vnd.apple.pages', info['Content-Type']
  end

  # The constructor requires exactly one argument.
  def test_yomu_cannot_be_initialized_without_parameters
    assert_raises(ArgumentError) { Yomu.new }
  end

  # Constructing from a file path produces no output and raises nothing.
  def test_yomu_can_be_initialized_with_a_path
    assert_silent { Yomu.new('test/samples/sample.pages') }
  end

  # Constructing from a URL produces no output and raises nothing.
  def test_yomu_can_be_initialized_with_a_url
    assert_silent { Yomu.new('http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx') }
  end

  # Constructing from an open file (anything responding to +read+) is accepted.
  def test_yomu_can_be_initialized_with_a_stream_or_object_that_can_be_read
    assert_silent do
      File.open('test/samples/sample.pages', 'r') { |handle| Yomu.new(handle) }
    end
  end

  # Objects that are neither Strings nor readable are rejected.
  def test_yomu_cannot_be_initialized_with_other_objects
    [nil, 1, 1.1].each do |unsupported|
      assert_raises(TypeError) { Yomu.new(unsupported) }
    end
  end

  # Path input: text extraction.
  def test_yomu_initialized_with_a_path_can_read_text
    reader = Yomu.new('test/samples/sample.pages')

    assert_includes reader.text, 'The quick brown fox jumped over the lazy cat.'
  end

  # Path input: metadata extraction.
  def test_yomu_initialized_with_a_path_can_read_metadata
    reader = Yomu.new('test/samples/sample.pages')

    assert_equal 'application/vnd.apple.pages', reader.metadata['Content-Type']
  end

  # URL input: text extraction (needs network access).
  def test_yomu_initialized_with_a_url_can_read_text
    reader = Yomu.new('http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx')

    assert_includes reader.text, 'Lorem ipsum dolor sit amet, consectetuer adipiscing elit.'
  end

  # URL input: metadata extraction (needs network access).
  def test_yomu_initialized_with_a_url_can_read_metadata
    reader = Yomu.new('http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx')

    assert_equal 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', reader.metadata['Content-Type']
  end

  # Stream input: text extraction.
  def test_yomu_initialized_with_a_stream_can_read_text
    File.open('test/samples/sample.pages', 'rb') do |handle|
      reader = Yomu.new(handle)

      assert_includes reader.text, 'The quick brown fox jumped over the lazy cat.'
    end
  end

  # Stream input: metadata extraction.
  def test_yomu_initialized_with_a_stream_can_read_metadata
    File.open('test/samples/sample.pages', 'rb') do |handle|
      reader = Yomu.new(handle)

      assert_equal 'application/vnd.apple.pages', reader.metadata['Content-Type']
    end
  end
end
|
data/yomu.gemspec
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/yomu/version', __FILE__)
|
3
|
+
|
4
|
+
# Gem specification for yomu. File lists are taken from the git index, so
# the gem must be built from inside a git checkout.
Gem::Specification.new do |spec|
  # Identity
  spec.name          = "yomu"
  spec.version       = Yomu::VERSION
  spec.authors       = ["Erol Fornoles"]
  spec.email         = ["erol.fornoles@gmail.com"]
  spec.homepage      = "http://github.com/Erol/yomu"

  # Human-readable blurbs
  spec.description   = %q{Yomu is a library for extracting text and metadata using the Apache TIKA content analysis toolkit.}
  spec.summary       = %q{Yomu is a library for extracting text and metadata using the Apache TIKA content analysis toolkit.}

  # Packaged files, all derived from what git tracks
  tracked_binaries   = `git ls-files -- bin/*`.split("\n")
  spec.executables   = tracked_binaries.map { |path| File.basename(path) }
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.require_paths = ["lib"]
end
|
metadata
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: yomu
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Erol Fornoles
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-03-25 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: Yomu is a library for extracting text and metadata using the Apache TIKA
|
15
|
+
content analysis toolkit.
|
16
|
+
email:
|
17
|
+
- erol.fornoles@gmail.com
|
18
|
+
executables: []
|
19
|
+
extensions: []
|
20
|
+
extra_rdoc_files: []
|
21
|
+
files:
|
22
|
+
- .gitignore
|
23
|
+
- Gemfile
|
24
|
+
- LICENSE
|
25
|
+
- NOTICE.txt
|
26
|
+
- README.md
|
27
|
+
- Rakefile
|
28
|
+
- jar/tika-app-1.1.jar
|
29
|
+
- lib/yomu.rb
|
30
|
+
- lib/yomu/version.rb
|
31
|
+
- test/samples/sample.pages
|
32
|
+
- test/test_helper.rb
|
33
|
+
- test/yomu_test.rb
|
34
|
+
- yomu.gemspec
|
35
|
+
homepage: http://github.com/Erol/yomu
|
36
|
+
licenses: []
|
37
|
+
post_install_message:
|
38
|
+
rdoc_options: []
|
39
|
+
require_paths:
|
40
|
+
- lib
|
41
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
43
|
+
requirements:
|
44
|
+
- - ! '>='
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0'
|
47
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
48
|
+
none: false
|
49
|
+
requirements:
|
50
|
+
- - ! '>='
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
requirements: []
|
54
|
+
rubyforge_project:
|
55
|
+
rubygems_version: 1.8.17
|
56
|
+
signing_key:
|
57
|
+
specification_version: 3
|
58
|
+
summary: Yomu is a library for extracting text and metadata using the Apache TIKA
|
59
|
+
content analysis toolkit.
|
60
|
+
test_files:
|
61
|
+
- test/samples/sample.pages
|
62
|
+
- test/test_helper.rb
|
63
|
+
- test/yomu_test.rb
|