ruby_tika_app_lambda 1.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.rspec +3 -0
- data/Gemfile +6 -0
- data/HISTORY +13 -0
- data/LICENSE +20 -0
- data/README.md +54 -0
- data/Rakefile +3 -0
- data/ext/tika-config.xml +13 -0
- data/lib/ruby_tika_app.rb +79 -0
- data/ruby_tika_app.gemspec +31 -0
- data/spec/docs/cnn.com +1473 -0
- data/spec/docs/graph sampling simplex - 11.pdf +0 -0
- data/spec/docs/news.ycombinator.com +24 -0
- data/spec/ruby_tika_app_spec.rb +122 -0
- data/spec/spec_helper.rb +20 -0
- data/spec/support/test_server.rb +23 -0
- metadata +192 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 941f6c5387c687023b8160fb54b098d3fbb8ed49b8266e272c6880157f773b01
|
4
|
+
data.tar.gz: e54d467f6624d299809213c308e57ef6c3e12fc0389d21596e58e53ea581eb27
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 076f2194cca270d47e458fbffada19bcc03454f285e300cdf51236fb4a88ab0bd18b3285baf90b2c4d17bf1f2e80665737dfdcdb895c73a86d5ad27159c95a27
|
7
|
+
data.tar.gz: 1e6a2d67a050fc0a2b70ae97525400dbe95814ca98ee9fbfb2ab7f50b777506b865dd6898bfccce21174c0fce02d963668ba29f03dbc8142daedbe5c8a8b214d
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/HISTORY
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
1.9.0 - February 4, 2020
|
2
|
+
* Bumped tika to 1.23
|
3
|
+
|
4
|
+
1.0.1 - May 8, 2013
|
5
|
+
* Fixed issue where URLs were not being parsed.
|
6
|
+
|
7
|
+
0.2.0 - November 30, 2011
|
8
|
+
* Fixed open4 bundler issue - file was getting required that needed open4 before add_dependency
|
9
|
+
* Added README info, HISTORY
|
10
|
+
* Added more tests
|
11
|
+
|
12
|
+
0.1.0 - November 29, 2011
|
13
|
+
* Initial release
|
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2011-2020 Chris Parker
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
## Ruby Tika Parser
|
2
|
+
|
3
|
+
### Introduction
|
4
|
+
|
5
|
+
This is a simple frontend to the Java Tika parser command line jar / app.
|
6
|
+
|
7
|
+
It is the same as running:
|
8
|
+
|
9
|
+
java -server -Djava.awt.headless=true -Dfile.encoding=UTF-8 -jar /opt/tika-app.jar FileToParse.pdf
|
10
|
+
|
11
|
+
with options like --xml, --text, etc.
|
12
|
+
|
13
|
+
### Installation
|
14
|
+
|
15
|
+
To install, add ruby_tika_app_lambda to your _Gemfile_ and run `bundle install`:
|
16
|
+
|
17
|
+
gem 'ruby_tika_app_lambda', require: 'ruby_tika_app'
|
18
|
+
|
19
|
+
|
20
|
+
### Note about installation
|
21
|
+
|
22
|
+
RubyTikaApp is a pretty big gem since it includes the ruby-tika-app jarfile.
|
23
|
+
It might take a while to install.
|
24
|
+
|
25
|
+
### Usage
|
26
|
+
|
27
|
+
First, you need Java installed. And it needs to be in your $PATH.
|
28
|
+
|
29
|
+
Then:
|
30
|
+
|
31
|
+
```ruby
|
32
|
+
require 'ruby_tika_app'
|
33
|
+
|
34
|
+
rta = RubyTikaApp.new("sample_file.pdf")
|
35
|
+
|
36
|
+
puts rta.to_xml # <xml output>
|
37
|
+
|
38
|
+
# You also get to_html, to_json, to_text, to_text_main, and to_metadata
|
39
|
+
|
40
|
+
```
|
41
|
+
|
42
|
+
### Testing
|
43
|
+
|
44
|
+
Run:
|
45
|
+
|
46
|
+
bundle exec rspec spec/
|
47
|
+
|
48
|
+
*NOTE*: Since we are using an underlying java library to connect to external
|
49
|
+
URLs we can't use a standard mocking library. The test suite starts a
|
50
|
+
rack-based web server.
|
51
|
+
|
52
|
+
### Contributing
|
53
|
+
|
54
|
+
Fork on GitHub and after you've committed tested patches, send a pull request.
|
data/Rakefile
ADDED
data/ext/tika-config.xml
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<properties>
|
3
|
+
<service-loader initializableProblemHandler="ignore"/>
|
4
|
+
<parsers>
|
5
|
+
<!-- Default Parser for most things, except for 2 mime types, and never
|
6
|
+
use the SQLite3 Parser -->
|
7
|
+
<parser class="org.apache.tika.parser.DefaultParser">
|
8
|
+
<mime-exclude>image/jpeg</mime-exclude>
|
9
|
+
<mime-exclude>application/x-sqlite3</mime-exclude>
|
10
|
+
<parser-exclude class="org.apache.tika.parser.jdbc.SQLite3Parser"/>
|
11
|
+
</parser>
|
12
|
+
</parsers>
|
13
|
+
</properties>
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Based on the rake remote task code
|
4
|
+
|
5
|
+
require 'rubygems'
|
6
|
+
require 'stringio'
|
7
|
+
require 'open4'
|
8
|
+
|
9
|
+
# Thin wrapper around the Apache Tika command-line app.
#
# An instance is bound to one document (a local path or an http/https URL).
# Each +to_*+ method shells out to +java -jar /opt/tika-app.jar+ with the
# matching Tika output switch and returns whatever Tika printed on stdout.
class RubyTikaApp
  # Base class for every error raised by this wrapper.
  class Error < RuntimeError; end

  # Raised when Tika printed nothing on stdout but did write to stderr.
  class CommandFailedError < Error
    # @return [Object] the value handed to the constructor; +run_tika+ passes
    #   the captured stderr text here (not a numeric exit status).
    attr_reader :status

    # @param status [Object] failure details to expose through {#status}
    def initialize(status)
      @status = status
    end
  end

  # Matches a string that starts with an http:// or https:// URL.
  URL_PATTERN = %r{\Ahttps?://\S+}.freeze
  private_constant :URL_PATTERN

  # @param document [#to_s] local file path or http(s) URL to hand to Tika.
  #   Anything that is not an http(s) URL is prefixed with +file://+.
  def initialize(document)
    # to_s lets Pathname (and nil) through; `=~` on a non-String would raise
    # NoMethodError on recent Rubies.
    source = document.to_s
    @document = URL_PATTERN.match?(source) ? source : "file://#{source}"

    java_cmd = 'java'
    java_args = '-server -Djava.awt.headless=true -Dfile.encoding=UTF-8'
    tika_path = '/opt/tika-app.jar'
    tika_config_path = File.expand_path('../ext/tika-config.xml', __dir__)

    @tika_cmd = "#{java_cmd} #{java_args} -jar #{shell_quote(tika_path)} " \
                "--config=#{shell_quote(tika_config_path)}"
  end

  # @return [String] Tika output produced with +--xml+
  def to_xml
    run_tika('--xml')
  end

  # @return [String] Tika output produced with +--html+
  def to_html
    run_tika('--html')
  end

  # Extra arguments are accepted and ignored so that callers which pass
  # generator state to +to_json+ do not blow up.
  #
  # @return [String] Tika output produced with +--json+
  def to_json(*_args)
    run_tika('--json')
  end

  # @return [String] Tika output produced with +--text+
  def to_text
    run_tika('--text')
  end

  # @return [String] Tika output produced with +--text-main+
  def to_text_main
    run_tika('--text-main')
  end

  # @return [String] Tika output produced with +--metadata+
  def to_metadata
    run_tika('--metadata')
  end

  private

  # Wraps +value+ in single quotes for the shell, escaping any embedded
  # single quote so the value can never terminate the quoting early.
  #
  # @param value [#to_s]
  # @return [String]
  def shell_quote(value)
    # Block form of gsub: a replacement *string* would treat \' as a
    # back-reference to the post-match text.
    escaped = value.to_s.gsub("'") { "'\\''" }
    "'#{escaped}'"
  end

  # Runs Tika with the given output switch against the bound document.
  #
  # @param option [String] Tika command-line switch, e.g. +--text+
  # @return [String] stripped stdout of the Tika process
  # @raise [CommandFailedError] when stdout is empty and stderr is not
  def run_tika(option)
    final_cmd = "#{@tika_cmd} #{option} #{shell_quote(@document)}"
    stdout_result = nil
    stderr_result = nil

    # Block form: open4 waits for the child and closes the pipes when the
    # block exits, so no zombie process or leaked descriptor is left behind,
    # and a failure to spawn surfaces as the original exception.
    Open4.popen4(final_cmd) do |_pid, stdin, stdout, stderr|
      stdin.close
      # Drain stderr concurrently; reading stdout to EOF first can deadlock
      # once the child fills the stderr pipe buffer.
      stderr_reader = Thread.new { stderr.read }
      stdout_result = stdout.read.strip
      stderr_result = stderr_reader.value.strip
    end

    if stdout_result.empty? && !stderr_result.empty?
      raise(CommandFailedError.new(stderr_result), "execution failed with status #{stderr_result}: #{final_cmd}")
    end

    stdout_result
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
$LOAD_PATH.push File.expand_path('lib', __dir__)
|
4
|
+
|
5
|
+
# Gem packaging metadata for ruby_tika_app_lambda.
Gem::Specification.new do |s|
  s.name        = 'ruby_tika_app_lambda'
  s.version     = '1.25.0'
  s.platform    = Gem::Platform::RUBY
  s.authors     = ['Chris Parker', 'Eric Musgrove']
  s.email       = %w[mrcsparker@gmail.com eric.musgrove@stoatlabs.com]
  s.homepage    = 'https://github.com/StoatLabs/ruby_tika_app'
  s.summary     = 'Wrapper around the tika-app jar'
  s.description = 'Wrapper around the tika-app jar'
  # The bundled LICENSE file carries the MIT licence text.
  s.license     = 'MIT'

  # LICENSE/README.md/HISTORY are appended explicitly so they ship even when
  # git does not list them; uniq drops the duplicates when it does.
  s.files = (`git ls-files`.split("\n") + %w[LICENSE README.md HISTORY]).uniq
  s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = %w[lib]
  s.test_files = Dir.glob('spec/**/*')

  s.add_runtime_dependency('open4')

  s.add_development_dependency('bundler', '>= 1.0.15')
  s.add_development_dependency('json')
  s.add_development_dependency('pry')
  s.add_development_dependency('rack')
  s.add_development_dependency('rake')
  s.add_development_dependency('rspec', '~> 3.9.0')
  s.add_development_dependency('simplecov')
  s.add_development_dependency('thin')
end
|