ruby_tika_app_lambda 1.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 941f6c5387c687023b8160fb54b098d3fbb8ed49b8266e272c6880157f773b01
4
+ data.tar.gz: e54d467f6624d299809213c308e57ef6c3e12fc0389d21596e58e53ea581eb27
5
+ SHA512:
6
+ metadata.gz: 076f2194cca270d47e458fbffada19bcc03454f285e300cdf51236fb4a88ab0bd18b3285baf90b2c4d17bf1f2e80665737dfdcdb895c73a86d5ad27159c95a27
7
+ data.tar.gz: 1e6a2d67a050fc0a2b70ae97525400dbe95814ca98ee9fbfb2ab7f50b777506b865dd6898bfccce21174c0fce02d963668ba29f03dbc8142daedbe5c8a8b214d
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ .sublime-project
6
+ .sublime-project.sublime-workspace
7
+ .rbenv-version
8
+ .idea
9
+ coverage/*
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --colour --profile
2
+ --format documentation
3
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
# frozen_string_literal: true

# Fetch gems over HTTPS so package downloads are encrypted in transit.
source 'https://rubygems.org'

# Specify your gem's dependencies in ruby_tika_app.gemspec
gemspec
data/HISTORY ADDED
@@ -0,0 +1,13 @@
1
+ 1.9.0 - February 4, 2020
2
+ * Bumped tika to 1.23
3
+
4
+ 1.0.1 - May 8, 2013
5
+ * Fixed issue where URLs were not being parsed.
6
+
7
+ 0.2.0 - November 30, 2011
8
+ * Fixed open4 bundler issue - file was getting required that needed open4 before add_dependency
9
+ * Added README info, HISTORY
10
+ * Added more tests
11
+
12
+ 0.1.0 - November 29, 2011
13
+ * Initial release
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011-2020 Chris Parker
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,54 @@
1
+ ## Ruby Tika Parser
2
+
3
+ ### Introduction
4
+
5
+ This is a simple frontend to the Java Tika parser command line jar / app.
6
+
7
+ It is the same as running:
8
+
9
+ java -server -Djava.awt.headless=true -Dfile.encoding=UTF-8 -jar /opt/tika-app.jar FileToParse.pdf
10
+
11
+ with options like --xml, --text, etc.
12
+
13
+ ### Installation
14
+
15
+ To install, add ruby_tika_app_lambda to your _Gemfile_ and run `bundle install`:
16
+
17
+ gem 'ruby_tika_app_lambda'
18
+
19
+
20
+ ### Note about installation
21
+
22
+ RubyTikaApp is a pretty big gem since it includes the ruby-tika-app jarfile.
23
+ It might take a while to install.
24
+
25
+ ### Usage
26
+
27
+ First, you need Java installed and available in your $PATH. The Tika app jar is expected at /opt/tika-app.jar.
28
+
29
+ Then:
30
+
31
+ ```ruby
32
+ require 'ruby_tika_app'
33
+
34
+ rta = RubyTikaApp.new("sample_file.pdf")
35
+
36
+ puts rta.to_xml # <xml output>
37
+
38
+ # You also get to_html, to_json, to_text, to_text_main, and to_metadata
39
+
40
+ ```
41
+
42
+ ### Testing
43
+
44
+ Run:
45
+
46
+ bundle exec rspec spec/
47
+
48
+ *NOTE*: Since we are using an underlying java library to connect to external
49
+ URLs we can't use a standard mocking library. The test suite starts a
50
+ rack-based web server.
51
+
52
+ ### Contributing
53
+
54
+ Fork on GitHub and after you've committed tested patches, send a pull request.
data/Rakefile ADDED
@@ -0,0 +1,3 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
@@ -0,0 +1,13 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <properties>
3
+ <service-loader initializableProblemHandler="ignore"/>
4
+ <parsers>
5
+ <!-- Default Parser for most things, except for 2 mime types, and never
6
+ use the SQLite3 Parser -->
7
+ <parser class="org.apache.tika.parser.DefaultParser">
8
+ <mime-exclude>image/jpeg</mime-exclude>
9
+ <mime-exclude>application/x-sqlite3</mime-exclude>
10
+ <parser-exclude class="org.apache.tika.parser.jdbc.SQLite3Parser"/>
11
+ </parser>
12
+ </parsers>
13
+ </properties>
@@ -0,0 +1,79 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Based on the rake remote task code
4
+
5
+ require 'rubygems'
6
+ require 'stringio'
7
+ require 'open4'
8
+
9
require 'shellwords'

# Thin Ruby front end for the Apache Tika command line application.
#
# An instance wraps a single document (a local path or an http/https URL)
# and exposes one method per Tika output option. Every call starts a new
# `java -jar` process through Open4 and returns what it printed on stdout.
class RubyTikaApp
  # Base class for the errors raised by RubyTikaApp.
  class Error < RuntimeError; end

  # Raised when the command prints nothing on stdout but something on stderr.
  class CommandFailedError < Error
    # @return [String] the stderr text captured from the command
    attr_reader :status

    # @param status [String] the stderr text captured from the command
    def initialize(status)
      @status = status
    end
  end

  # Executable used to launch the JVM; resolved through $PATH.
  JAVA_COMMAND = 'java'
  # JVM flags passed on every invocation.
  JAVA_ARGS = '-server -Djava.awt.headless=true -Dfile.encoding=UTF-8'
  # Location of the Tika application jar.
  TIKA_JAR_PATH = '/opt/tika-app.jar'
  # Tika configuration file, resolved relative to this source file.
  TIKA_CONFIG_PATH = File.expand_path('../ext/tika-config.xml', __dir__).freeze
  # Matches documents that are already http/https URLs.
  URL_PATTERN = %r{\Ahttps?://\S+}.freeze

  # @param document [#to_s] an http/https URL, or a path to a local file.
  #   Local paths are made absolute before the file:// prefix is added, so a
  #   relative path no longer ends up in the host position of the URL.
  def initialize(document)
    source = document.to_s
    @document = if source.match?(URL_PATTERN)
                  source
                else
                  "file://#{File.expand_path(source)}"
                end

    @tika_cmd = "#{JAVA_COMMAND} #{JAVA_ARGS} -jar #{Shellwords.escape(TIKA_JAR_PATH)} " \
                "--config=#{Shellwords.escape(TIKA_CONFIG_PATH)}"
  end

  # @return [String] output of Tika's --xml option
  def to_xml
    run_tika('--xml')
  end

  # @return [String] output of Tika's --html option
  def to_html
    run_tika('--html')
  end

  # Accepts and ignores arguments so generic to_json(*args) callers still work.
  #
  # @return [String] output of Tika's --json option
  def to_json(*_args)
    run_tika('--json')
  end

  # @return [String] output of Tika's --text option
  def to_text
    run_tika('--text')
  end

  # @return [String] output of Tika's --text-main option
  def to_text_main
    run_tika('--text-main')
  end

  # @return [String] output of Tika's --metadata option
  def to_metadata
    run_tika('--metadata')
  end

  private

  # Runs Tika with the given option against the wrapped document.
  #
  # @param option [String] Tika command line switch, e.g. '--text'
  # @return [String] stdout of the command with surrounding whitespace removed
  # @raise [CommandFailedError] when stdout is empty and stderr is not
  def run_tika(option)
    # The document is caller supplied, so it is shell-escaped rather than
    # wrapped in hand-written quotes that a single quote could break out of.
    final_cmd = "#{@tika_cmd} #{option} #{Shellwords.escape(@document)}"

    pid, stdin, stdout, stderr = Open4.popen4(final_cmd)

    # NOTE(review): stdout is drained completely before stderr is read; a
    # command that writes a very large amount to stderr first could stall.
    stdout_result = stdout.read.strip
    stderr_result = stderr.read.strip

    if stdout_result.empty? && !stderr_result.empty?
      raise(CommandFailedError.new(stderr_result),
            "execution failed with status #{stderr_result}: #{final_cmd}")
    end

    stdout_result
  ensure
    # The streams are nil when popen4 itself raised; skipping them keeps the
    # original exception instead of replacing it with a NoMethodError.
    [stdin, stdout, stderr].each { |io| io.close if io && !io.closed? }
    reap(pid) if pid
  end

  # Waits for the child process so it does not linger as a zombie.
  def reap(pid)
    Process.wait(pid)
  rescue Errno::ECHILD
    # Already collected elsewhere; nothing left to do.
    nil
  end
end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ $LOAD_PATH.push File.expand_path('lib', __dir__)
4
+
5
# Gem specification for ruby_tika_app_lambda, a wrapper around the tika-app jar.
Gem::Specification.new do |s|
  s.name        = 'ruby_tika_app_lambda'
  s.version     = '1.25.0'
  s.platform    = Gem::Platform::RUBY
  s.authors     = ['Chris Parker', 'Eric Musgrove']
  s.email       = %w[mrcsparker@gmail.com eric.musgrove@stoatlabs.com]
  s.homepage    = 'https://github.com/StoatLabs/ruby_tika_app'
  s.summary     = 'Wrapper around the tika-app jar'
  s.description = 'Wrapper around the tika-app jar'
  # The bundled LICENSE file carries the MIT license text.
  s.license     = 'MIT'

  # Package everything tracked by git, plus the top-level documents by name.
  s.files         = `git ls-files`.split("\n") +
                    %w[LICENSE README.md HISTORY]
  # Executables are whatever git tracks under bin/, listed by base name.
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = %w[lib]
  s.test_files    = Dir.glob('spec/**/*')

  # Used by the library to run the java command.
  s.add_runtime_dependency('open4')

  s.add_development_dependency('bundler', '>= 1.0.15')
  s.add_development_dependency('json')
  s.add_development_dependency('pry')
  s.add_development_dependency('rack')
  s.add_development_dependency('rake')
  s.add_development_dependency('rspec', '~> 3.9.0')
  s.add_development_dependency('simplecov')
  s.add_development_dependency('thin')
end