henkei 1.14.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.rspec +2 -0
- data/.travis.yml +9 -0
- data/Gemfile +4 -0
- data/LICENSE +23 -0
- data/NOTICE.txt +11 -0
- data/README.md +109 -0
- data/Rakefile +8 -0
- data/henkei.gemspec +27 -0
- data/jar/tika-app-1.14.jar +0 -0
- data/lib/henkei.rb +268 -0
- data/lib/henkei/version.rb +3 -0
- data/spec/helper.rb +6 -0
- data/spec/henkei_spec.rb +182 -0
- data/spec/samples/sample filename with spaces.pages +0 -0
- data/spec/samples/sample-metadata-values-with-colons.doc +0 -0
- data/spec/samples/sample.docx +0 -0
- data/spec/samples/sample.pages +0 -0
- metadata +142 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 8f90fc4730d0e9fe321311d38bad4b9ab34ccc5b
|
4
|
+
data.tar.gz: 2df0e595e7fdfa1b7e15c986755321dfdcaeafc8
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 97bedb4df8bc4665a756b4a5f92f15a74ef11ac415483a5eb0cb6d4d9b368f92edf9d51b9460bfafa3ae25dd68c909b5e12e27f8cf2237298dece177bc793430
|
7
|
+
data.tar.gz: 031e5c953328a5dff2578b13720a49cda89ebc7fc2b26f71619ea8144cbd56032a390c2bdead4230623eaf4b009d0cc4af88d5d6ad648152bc55dbaa8a8d4341
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
Copyright (c) 2017 Andrew Bromwich
|
2
|
+
Copyright (c) 2012 Erol Fornoles
|
3
|
+
|
4
|
+
MIT License
|
5
|
+
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
7
|
+
a copy of this software and associated documentation files (the
|
8
|
+
"Software"), to deal in the Software without restriction, including
|
9
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
10
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
11
|
+
permit persons to whom the Software is furnished to do so, subject to
|
12
|
+
the following conditions:
|
13
|
+
|
14
|
+
The above copyright notice and this permission notice shall be
|
15
|
+
included in all copies or substantial portions of the Software.
|
16
|
+
|
17
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
18
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
19
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
20
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
21
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
22
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
23
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/NOTICE.txt
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
Henkei
|
2
|
+
Copyright 2017 Andrew Bromwich, released under the MIT license
|
3
|
+
|
4
|
+
Yomu
|
5
|
+
Copyright 2011 Erol Fornoles, released under the MIT license
|
6
|
+
|
7
|
+
Apache Tika
|
8
|
+
Copyright 2011 The Apache Software Foundation
|
9
|
+
|
10
|
+
This product includes software developed at
|
11
|
+
The Apache Software Foundation (http://www.apache.org/).
|
data/README.md
ADDED
@@ -0,0 +1,109 @@
|
|
1
|
+
[![Travis Build Status](http://img.shields.io/travis/Erol/yomu.svg?style=flat)](https://travis-ci.org/Erol/yomu)
|
2
|
+
[![Code Climate Score](http://img.shields.io/codeclimate/github/Erol/yomu.svg?style=flat)](https://codeclimate.com/github/Erol/yomu)
|
3
|
+
[![Gem Version](http://img.shields.io/gem/v/henkei.svg?style=flat)](#)
|
4
|
+
|
5
|
+
# Henkei 変形
|
6
|
+
|
7
|
+
[Henkei](http://github.com/abrom/henkei) is a library for extracting text and metadata from files and documents using the [Apache Tika](http://tika.apache.org/) content analysis toolkit.
|
8
|
+
|
9
|
+
Here are some of the formats supported:
|
10
|
+
|
11
|
+
- Microsoft Office OLE 2 and Office Open XML Formats (.doc, .docx, .xls, .xlsx,
|
12
|
+
.ppt, .pptx)
|
13
|
+
- OpenOffice.org OpenDocument Formats (.odt, .ods, .odp)
|
14
|
+
- Apple iWork Formats (.pages, .numbers, .key)
|
15
|
+
- Rich Text Format (.rtf)
|
16
|
+
- Portable Document Format (.pdf)
|
17
|
+
|
18
|
+
For the complete list of supported formats, please visit the Apache Tika
|
19
|
+
[Supported Document Formats](http://tika.apache.org/1.14/formats.html) page.
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
Text, metadata and MIME type information can be extracted by calling `Henkei.read` directly:
|
24
|
+
|
25
|
+
```ruby
|
26
|
+
require 'henkei'
|
27
|
+
|
28
|
+
data = File.read 'sample.pages'
|
29
|
+
text = Henkei.read :text, data
|
30
|
+
metadata = Henkei.read :metadata, data
|
31
|
+
mimetype = Henkei.read :mimetype, data
|
32
|
+
```
|
33
|
+
|
34
|
+
### Reading text from a given filename
|
35
|
+
|
36
|
+
Create a new instance of Henkei and pass a filename.
|
37
|
+
|
38
|
+
```ruby
|
39
|
+
henkei = Henkei.new 'sample.pages'
|
40
|
+
text = henkei.text
|
41
|
+
```
|
42
|
+
|
43
|
+
### Reading text from a given URL
|
44
|
+
|
45
|
+
This is useful for reading remote files, like documents hosted on Amazon S3.
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
henkei = Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
|
49
|
+
text = henkei.text
|
50
|
+
```
|
51
|
+
|
52
|
+
### Reading text from a stream
|
53
|
+
|
54
|
+
Henkei can also read from a stream or any object that responds to `read`, including file uploads from Ruby on Rails or Sinatra.
|
55
|
+
|
56
|
+
```ruby
|
57
|
+
post '/:name/:filename' do
|
58
|
+
henkei = Henkei.new params[:data][:tempfile]
|
59
|
+
henkei.text
|
60
|
+
end
|
61
|
+
```
|
62
|
+
|
63
|
+
### Reading metadata
|
64
|
+
|
65
|
+
Metadata is returned as a hash.
|
66
|
+
|
67
|
+
```ruby
|
68
|
+
henkei = Henkei.new 'sample.pages'
|
69
|
+
henkei.metadata['Content-Type'] #=> "application/vnd.apple.pages"
|
70
|
+
```
|
71
|
+
|
72
|
+
### Reading MIME types
|
73
|
+
|
74
|
+
MIME type is returned as a MIME::Type object.
|
75
|
+
|
76
|
+
```ruby
|
77
|
+
henkei = Henkei.new 'sample.docx'
|
78
|
+
henkei.mimetype.content_type #=> "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
79
|
+
henkei.mimetype.extensions #=> ['docx']
|
80
|
+
```
|
81
|
+
|
82
|
+
## Installation and Dependencies
|
83
|
+
|
84
|
+
### Java Runtime
|
85
|
+
|
86
|
+
Henkei packages the Apache Tika application jar and requires a working Java Runtime Environment (JRE) to run it.
|
87
|
+
|
88
|
+
### Gem
|
89
|
+
|
90
|
+
Add this line to your application's Gemfile:
|
91
|
+
|
92
|
+
gem 'henkei'
|
93
|
+
|
94
|
+
And then execute:
|
95
|
+
|
96
|
+
$ bundle
|
97
|
+
|
98
|
+
Or install it yourself as:
|
99
|
+
|
100
|
+
$ gem install henkei
|
101
|
+
|
102
|
+
## Contributing
|
103
|
+
|
104
|
+
1. Fork it
|
105
|
+
2. Create your feature branch ( `git checkout -b my-new-feature` )
|
106
|
+
3. Create tests and make them pass ( `rake test` )
|
107
|
+
4. Commit your changes ( `git commit -am 'Added some feature'` )
|
108
|
+
5. Push to the branch ( `git push origin my-new-feature` )
|
109
|
+
6. Create a new Pull Request
|
data/Rakefile
ADDED
data/henkei.gemspec
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'henkei/version'
|
5
|
+
|
6
|
+
# Gem packaging metadata for henkei.
Gem::Specification.new do |spec|
  spec.name          = 'henkei'
  spec.version       = Henkei::VERSION
  spec.authors       = ['Erol Fornoles', 'Andrew Bromwich']
  spec.email         = ['erol.fornoles@gmail.com', 'a.bromwich@gmail.com']
  spec.description   = %q{Read text and metadata from files and documents (.doc, .docx, .pages, .odt, .rtf, .pdf)}
  spec.summary       = %q{Read text and metadata from files and documents (.doc, .docx, .pages, .odt, .rtf, .pdf)}
  spec.homepage      = 'http://github.com/abrom/henkei'
  spec.license       = 'MIT'

  # Ask git for a NUL-delimited listing so that every tracked path comes back
  # verbatim; the newline-delimited form quotes paths containing unusual
  # characters, which would then not match any file on disk.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  spec.add_runtime_dependency 'mime-types', '~> 1.23'
  spec.add_runtime_dependency 'json', '~> 1.8'

  spec.add_development_dependency 'bundler', '~> 1.3'
  spec.add_development_dependency 'rake'
  spec.add_development_dependency 'rspec', '~> 2.14'
end
|
Binary file
|
data/lib/henkei.rb
ADDED
@@ -0,0 +1,268 @@
|
|
1
|
+
require 'henkei/version'

require 'net/http'
require 'mime/types'
require 'json'

require 'socket'
require 'stringio'
require 'time'
|
9
|
+
|
10
|
+
# Extracts text, HTML, metadata and MIME type information from documents by
# running the bundled Apache Tika application jar, either once per call or
# through a long-running Tika server process (see Henkei.server).
class Henkei
  GEMPATH = File.dirname(File.dirname(__FILE__))
  JARPATH = File.join(GEMPATH, 'jar', 'tika-app-1.14.jar')
  DEFAULT_SERVER_PORT = 9293 # an arbitrary, but perfectly cromulent, port

  # Tika command-line switches for each supported extraction type.
  TIKA_SWITCHES = {
    :text     => ['-t'],
    :html     => ['-h'],
    :metadata => ['-m', '-j'],
    :mimetype => ['-m', '-j']
  }.freeze
  private_constant :TIKA_SWITCHES

  # Port and pid of the Tika server started by Henkei.server (nil when no
  # server is running). Kept as class variables because the spec reads them
  # through class_variable_get.
  @@server_port = nil
  @@server_pid = nil

  # Read text or metadata from a data buffer.
  #
  #   data = File.read 'sample.pages'
  #   text = Henkei.read :text, data
  #   metadata = Henkei.read :metadata, data
  #
  # type is one of :text, :html, :metadata or :mimetype. Returns a String for
  # :text and :html, the parsed JSON Hash for :metadata, the first matching
  # MIME::Type for :mimetype, and nil for any other type.

  def self.read(type, data)
    result = @@server_pid ? _server_read(type, data) : _client_read(type, data)

    case type
    when :text, :html
      result
    when :metadata
      JSON.parse(result)
    when :mimetype
      content_type = JSON.parse(result)['Content-Type']
      # Tika reports several Content-Type values for some formats; use the
      # first one, exactly as the #mimetype instance method does.
      content_type = content_type.first if content_type.is_a?(Array)
      MIME::Types[content_type].first
    end
  end

  # Runs the Tika jar once, feeding +data+ to its standard input, and returns
  # everything Tika wrote to standard output.
  #
  # The command is passed as an argument list rather than a single string so
  # that a JAVA_HOME or gem path containing spaces is not split into several
  # arguments.

  def self._client_read(type, data)
    IO.popen tika_command(*TIKA_SWITCHES.fetch(type, [])), 'r+' do |io|
      io.write data
      io.close_write
      io.read
    end
  end

  # Sends +data+ to the running Tika server and returns its complete reply.
  #
  # The first argument (the extraction type) is ignored: the output type was
  # fixed by the switch given to Henkei.server when the server was started.

  def self._server_read(_, data)
    socket = TCPSocket.new('localhost', @@server_port)

    begin
      socket.write(data)

      # tell Tika that we're done sending data
      socket.shutdown(Socket::SHUT_WR)

      resp = ''
      loop do
        chunk = socket.recv(65536)
        # End of stream is signalled by an empty string or by nil; check for
        # nil first so that nil never receives #empty?.
        break if chunk.nil? || chunk.empty?
        resp << chunk
      end
      resp
    ensure
      # Always release the connection, even when the exchange fails part-way.
      socket.close
    end
  end

  # Create a new instance of Henkei with a given document.
  #
  # Using a file path:
  #
  #   Henkei.new 'sample.pages'
  #
  # Using a URL:
  #
  #   Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
  #
  # From a stream or an object which responds to +read+
  #
  #   Henkei.new File.open('sample.pages')
  #
  # Raises Errno::ENOENT when a String is neither an existing file nor a URI,
  # and TypeError when the input is not a String and does not respond to +read+.

  def initialize(input)
    if input.is_a? String
      if File.exist? input
        @path = input
      elsif input =~ URI.regexp
        @uri = URI.parse input
      else
        raise Errno::ENOENT, "missing file or invalid URI - #{input}"
      end
    elsif input.respond_to? :read
      @stream = input
    else
      raise TypeError, "can't read from #{input.class.name}"
    end
  end

  # Returns the text content of the Henkei document.
  #
  #   henkei = Henkei.new 'sample.pages'
  #   henkei.text

  def text
    return @text if defined? @text

    @text = Henkei.read :text, data
  end

  # Returns the text content of the Henkei document in HTML.
  #
  #   henkei = Henkei.new 'sample.pages'
  #   henkei.html

  def html
    return @html if defined? @html

    @html = Henkei.read :html, data
  end

  # Returns the metadata hash of the Henkei document.
  #
  #   henkei = Henkei.new 'sample.pages'
  #   henkei.metadata['Content-Type']

  def metadata
    return @metadata if defined? @metadata

    @metadata = Henkei.read :metadata, data
  end

  # Returns the mimetype object of the Henkei document.
  #
  #   henkei = Henkei.new 'sample.docx'
  #   henkei.mimetype.content_type #=> 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  #   henkei.mimetype.extensions #=> ['docx']

  def mimetype
    return @mimetype if defined? @mimetype

    content_type = metadata['Content-Type']
    # Some formats report Content-Type as a list; the first entry is used.
    content_type = content_type.first if content_type.is_a?(Array)

    @mimetype = MIME::Types[content_type].first
  end

  # Returns the document's 'Creation-Date' metadata value parsed into a Time,
  # or +nil+ when the metadata has no such entry.
  #
  #   henkei = Henkei.new 'sample.pages'
  #   henkei.creation_date #=> a Time

  def creation_date
    return @creation_date if defined? @creation_date
    return unless metadata['Creation-Date']

    @creation_date = Time.parse(metadata['Creation-Date'])
  end

  # Returns +true+ if the Henkei document was specified using a file path.
  #
  #   henkei = Henkei.new 'sample.pages'
  #   henkei.path? #=> true

  def path?
    instance_variable_defined? :@path
  end

  # Returns +true+ if the Henkei document was specified using a URI.
  #
  #   henkei = Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
  #   henkei.uri? #=> true

  def uri?
    instance_variable_defined? :@uri
  end

  # Returns +true+ if the Henkei document was specified from a stream or an object which responds to +read+.
  #
  #   file = File.open('sample.pages')
  #   henkei = Henkei.new file
  #   henkei.stream? #=> true

  def stream?
    instance_variable_defined? :@stream
  end

  # Returns the raw/unparsed content of the Henkei document.
  #
  #   henkei = Henkei.new 'sample.pages'
  #   henkei.data
  #
  # The content is fetched once (from the file, the URI or the stream) and
  # cached for later calls.

  def data
    return @data if defined? @data

    if path?
      @data = File.read @path
    elsif uri?
      @data = Net::HTTP.get @uri
    elsif stream?
      @data = @stream.read
    end

    @data
  end

  # Returns pid of Tika server, started as a new spawned process.
  #
  #  type :html, :text or :metadata
  #  custom_port e.g. 9293
  #
  #  Henkei.server(:text, 9294)
  #
  # While a server is running, Henkei.read sends documents to it instead of
  # launching a new Java process for every call.

  def self.server(type, custom_port=nil)
    @@server_port = custom_port || DEFAULT_SERVER_PORT

    command = tika_command('--server', '--port', @@server_port.to_s, *TIKA_SWITCHES.fetch(type, []))
    @@server_pid = Process.spawn(*command)
    sleep(2) # Give the server 2 seconds to spin up.
    @@server_pid
  end

  # Kills server started by Henkei.server
  #
  #  Always run this when you're done, or else Tika might run until you kill it manually
  #  You might try putting your extraction in a begin..rescue...ensure...end block and
  #    putting this method in the ensure block.
  #
  #  Henkei.server(:text)
  #  reports = ["report1.docx", "report2.doc", "report3.pdf"]
  #  begin
  #    my_texts = reports.map{|report_path| Henkei.new(report_path).text }
  #  rescue
  #  ensure
  #    Henkei.kill_server!
  #  end
  def self.kill_server!
    return unless @@server_pid

    begin
      Process.kill('INT', @@server_pid)
    ensure
      # Forget the server even when the signal could not be delivered, so
      # that later reads do not keep targeting a process that is gone.
      @@server_pid = nil
      @@server_port = nil
    end
    nil
  end

  # Path of the java executable: taken from JAVA_HOME when that environment
  # variable is set, otherwise plain 'java' resolved through the PATH.
  def self.java
    ENV['JAVA_HOME'] ? ENV['JAVA_HOME'] + '/bin/java' : 'java'
  end
  private_class_method :java

  # Builds the argument list that runs the bundled Tika jar headlessly,
  # followed by any extra Tika arguments.
  def self.tika_command(*tika_args)
    [java, '-Djava.awt.headless=true', '-jar', JARPATH, *tika_args]
  end
  private_class_method :tika_command
end
|
data/spec/helper.rb
ADDED
data/spec/henkei_spec.rb
ADDED
@@ -0,0 +1,182 @@
|
|
1
|
+
require 'helper.rb'
|
2
|
+
require 'henkei'
|
3
|
+
|
4
|
+
# Specs for Henkei. They run the bundled Tika jar, so a working `java` must be
# on the PATH; the URI examples additionally need network access.
describe Henkei do
  let(:data) { File.read 'spec/samples/sample.docx' }

  # Make every example resolve `java` through the PATH unless it sets
  # JAVA_HOME itself.
  before do
    ENV['JAVA_HOME'] = nil
  end

  describe '.read' do
    it 'reads text' do
      text = Henkei.read :text, data

      expect(text).to include 'The quick brown fox jumped over the lazy cat.'
    end

    it 'reads metadata' do
      metadata = Henkei.read :metadata, data

      expect(metadata['Content-Type']).to eql 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    end

    it 'reads metadata values with colons as strings' do
      data = File.read 'spec/samples/sample-metadata-values-with-colons.doc'
      metadata = Henkei.read :metadata, data

      expect(metadata['dc:title']).to eql 'problem: test'
    end

    it 'reads mimetype' do
      mimetype = Henkei.read :mimetype, data

      expect(mimetype.content_type).to eql 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
      expect(mimetype.extensions).to include 'docx'
    end
  end

  describe '.new' do
    it 'requires parameters' do
      expect { Henkei.new }.to raise_error ArgumentError
    end

    it 'accepts a root path' do
      # Expand to an absolute path so this example differs from the
      # relative-path example below.
      henkei = Henkei.new File.expand_path('spec/samples/sample.pages')

      expect(henkei).to be_path
      expect(henkei).not_to be_uri
      expect(henkei).not_to be_stream
    end

    it 'accepts a relative path' do
      henkei = Henkei.new 'spec/samples/sample.pages'

      expect(henkei).to be_path
      expect(henkei).not_to be_uri
      expect(henkei).not_to be_stream
    end

    it 'accepts a path with spaces' do
      henkei = Henkei.new 'spec/samples/sample filename with spaces.pages'

      expect(henkei).to be_path
      expect(henkei).not_to be_uri
      expect(henkei).not_to be_stream
    end

    it 'accepts a URI' do
      henkei = Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'

      expect(henkei).to be_uri
      expect(henkei).not_to be_path
      expect(henkei).not_to be_stream
    end

    it 'accepts a stream or object that can be read' do
      File.open 'spec/samples/sample.pages', 'r' do |file|
        henkei = Henkei.new file

        expect(henkei).to be_stream
        expect(henkei).not_to be_path
        expect(henkei).not_to be_uri
      end
    end

    it 'refuses a path to a missing file' do
      expect { Henkei.new 'test/sample/missing.pages' }.to raise_error Errno::ENOENT
    end

    it 'refuses other objects' do
      [nil, 1, 1.1].each do |object|
        expect { Henkei.new object }.to raise_error TypeError
      end
    end
  end

  describe '#creation_date' do
    let(:henkei) { Henkei.new 'spec/samples/sample.pages' }

    it 'returns a Time' do
      expect(henkei.creation_date).to be_a Time
    end
  end

  describe '.java' do
    specify 'with no specified JAVA_HOME' do
      expect(Henkei.send(:java)).to eql 'java'
    end

    specify 'with a specified JAVA_HOME' do
      ENV['JAVA_HOME'] = '/path/to/java/home'

      expect(Henkei.send(:java)).to eql '/path/to/java/home/bin/java'
    end
  end

  context 'initialized with a given path' do
    let(:henkei) { Henkei.new 'spec/samples/sample.pages' }

    specify '#text reads text' do
      expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
    end

    specify '#metadata reads metadata' do
      expect(henkei.metadata['Content-Type']).to eql ["application/vnd.apple.pages", "application/vnd.apple.pages"]
    end
  end

  context 'initialized with a given URI' do
    let(:henkei) { Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx' }

    specify '#text reads text' do
      expect(henkei.text).to include 'Lorem ipsum dolor sit amet, consectetuer adipiscing elit.'
    end

    specify '#metadata reads metadata' do
      expect(henkei.metadata['Content-Type']).to eql 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    end
  end

  context 'initialized with a given stream' do
    let(:file) { File.open('spec/samples/sample.pages', 'rb') }
    let(:henkei) { Henkei.new file }

    # Close the sample file so examples do not leak open file handles.
    after { file.close }

    specify '#text reads text' do
      expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
    end

    specify '#metadata reads metadata' do
      expect(henkei.metadata['Content-Type']).to eql ["application/vnd.apple.pages", "application/vnd.apple.pages"]
    end
  end

  context 'working as server mode' do
    specify '#starts and kills server' do
      begin
        Henkei.server(:text)
        expect(Henkei.class_variable_get(:@@server_pid)).not_to be_nil
        expect(Henkei.class_variable_get(:@@server_port)).not_to be_nil

        s = TCPSocket.new('localhost', Henkei.class_variable_get(:@@server_port))
        expect(s).to be_a TCPSocket
        s.close
      ensure
        # Remember the port before kill_server! resets it, then check that
        # nothing is listening on it any more.
        port = Henkei.class_variable_get(:@@server_port)
        Henkei.kill_server!
        sleep 2
        expect { TCPSocket.new('localhost', port) }.to raise_error Errno::ECONNREFUSED
      end
    end

    specify '#runs samples through server mode' do
      begin
        Henkei.server(:text)
        expect(Henkei.new('spec/samples/sample.pages').text).to include 'The quick brown fox jumped over the lazy cat.'
        expect(Henkei.new('spec/samples/sample filename with spaces.pages').text).to include 'The quick brown fox jumped over the lazy cat.'
        expect(Henkei.new('spec/samples/sample.docx').text).to include 'The quick brown fox jumped over the lazy cat.'
      ensure
        Henkei.kill_server!
      end
    end
  end
end
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
metadata
ADDED
@@ -0,0 +1,142 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: henkei
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.14.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Erol Fornoles
|
8
|
+
- Andrew Bromwich
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2017-02-18 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: mime-types
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - "~>"
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: '1.23'
|
21
|
+
type: :runtime
|
22
|
+
prerelease: false
|
23
|
+
version_requirements: !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - "~>"
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: '1.23'
|
28
|
+
- !ruby/object:Gem::Dependency
|
29
|
+
name: json
|
30
|
+
requirement: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - "~>"
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '1.8'
|
35
|
+
type: :runtime
|
36
|
+
prerelease: false
|
37
|
+
version_requirements: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - "~>"
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '1.8'
|
42
|
+
- !ruby/object:Gem::Dependency
|
43
|
+
name: bundler
|
44
|
+
requirement: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - "~>"
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '1.3'
|
49
|
+
type: :development
|
50
|
+
prerelease: false
|
51
|
+
version_requirements: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - "~>"
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '1.3'
|
56
|
+
- !ruby/object:Gem::Dependency
|
57
|
+
name: rake
|
58
|
+
requirement: !ruby/object:Gem::Requirement
|
59
|
+
requirements:
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: '0'
|
63
|
+
type: :development
|
64
|
+
prerelease: false
|
65
|
+
version_requirements: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
- !ruby/object:Gem::Dependency
|
71
|
+
name: rspec
|
72
|
+
requirement: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - "~>"
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '2.14'
|
77
|
+
type: :development
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: !ruby/object:Gem::Requirement
|
80
|
+
requirements:
|
81
|
+
- - "~>"
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: '2.14'
|
84
|
+
description: Read text and metadata from files and documents (.doc, .docx, .pages,
|
85
|
+
.odt, .rtf, .pdf)
|
86
|
+
email:
|
87
|
+
- erol.fornoles@gmail.com
|
88
|
+
- a.bromwich@gmail.com
|
89
|
+
executables: []
|
90
|
+
extensions: []
|
91
|
+
extra_rdoc_files: []
|
92
|
+
files:
|
93
|
+
- ".gitignore"
|
94
|
+
- ".rspec"
|
95
|
+
- ".travis.yml"
|
96
|
+
- Gemfile
|
97
|
+
- LICENSE
|
98
|
+
- NOTICE.txt
|
99
|
+
- README.md
|
100
|
+
- Rakefile
|
101
|
+
- henkei.gemspec
|
102
|
+
- jar/tika-app-1.14.jar
|
103
|
+
- lib/henkei.rb
|
104
|
+
- lib/henkei/version.rb
|
105
|
+
- spec/helper.rb
|
106
|
+
- spec/henkei_spec.rb
|
107
|
+
- spec/samples/sample filename with spaces.pages
|
108
|
+
- spec/samples/sample-metadata-values-with-colons.doc
|
109
|
+
- spec/samples/sample.docx
|
110
|
+
- spec/samples/sample.pages
|
111
|
+
homepage: http://github.com/abrom/henkei
|
112
|
+
licenses:
|
113
|
+
- MIT
|
114
|
+
metadata: {}
|
115
|
+
post_install_message:
|
116
|
+
rdoc_options: []
|
117
|
+
require_paths:
|
118
|
+
- lib
|
119
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
120
|
+
requirements:
|
121
|
+
- - ">="
|
122
|
+
- !ruby/object:Gem::Version
|
123
|
+
version: '0'
|
124
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
125
|
+
requirements:
|
126
|
+
- - ">="
|
127
|
+
- !ruby/object:Gem::Version
|
128
|
+
version: '0'
|
129
|
+
requirements: []
|
130
|
+
rubyforge_project:
|
131
|
+
rubygems_version: 2.4.8
|
132
|
+
signing_key:
|
133
|
+
specification_version: 4
|
134
|
+
summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
|
135
|
+
.rtf, .pdf)
|
136
|
+
test_files:
|
137
|
+
- spec/helper.rb
|
138
|
+
- spec/henkei_spec.rb
|
139
|
+
- spec/samples/sample filename with spaces.pages
|
140
|
+
- spec/samples/sample-metadata-values-with-colons.doc
|
141
|
+
- spec/samples/sample.docx
|
142
|
+
- spec/samples/sample.pages
|