tika-client 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +10 -1
- data/VERSION +1 -1
- data/lib/tika/client.rb +22 -5
- data/lib/tika/request.rb +48 -28
- data/lib/tika/requests.rb +52 -8
- data/spec/fixtures/Lorem_ipsum.docx +0 -0
- data/spec/fixtures/Lorem_ipsum.pdf +0 -0
- data/spec/fixtures/Lorem_ipsum.png +0 -0
- data/spec/fixtures/Lorem_ipsum.tiff +0 -0
- data/spec/spec_helper.rb +4 -0
- data/spec/unit/client_spec.rb +51 -0
- data/spec/unit/request_spec.rb +10 -0
- metadata +14 -3
- data/lib/tika/endpoints.rb +0 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2657a16d46c66487c7e90bb41b17bc18b178e972
|
4
|
+
data.tar.gz: 9386b75938dd03c8622c7c8efb2dd641b3627f52
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6a205ff0d6422c6098c904f1ed2fa458367de5b991ca8197b54f6fbf64ea093ba5c3a7f3cd7ee85e3d34c9b042dca3df40ee94c4cbd327e0a8da502c7e058f37
|
7
|
+
data.tar.gz: 507f8ebc7ed4aeb4d6171450f7cf8b96b434dc903cc560dbfe3f8d49969474b81826a02ce7bcbf8fbb4ef1a6dac6d7a4bfb64a8942c62914824a781fb3f5e0cb
|
data/Rakefile
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require "bundler/gem_tasks"
|
2
|
+
require "rspec/core/rake_task"
|
2
3
|
require "openssl"
|
3
4
|
require "net/http"
|
4
5
|
|
@@ -36,7 +37,10 @@ namespace :tika do
|
|
36
37
|
|
37
38
|
desc "Start Tika server"
|
38
39
|
task :start do
|
39
|
-
if File.exists?(tika_path)
|
40
|
+
if File.exists?(PID_FILE)
|
41
|
+
pid = File.read(PID_FILE).strip
|
42
|
+
puts "Tika server is already running (PID #{pid})"
|
43
|
+
elsif File.exists?(tika_path)
|
40
44
|
puts "Starting Tika server ..."
|
41
45
|
File.open(PID_FILE, "w") do |pid_file|
|
42
46
|
pid = fork { exec "java -jar #{tika_path}" }
|
@@ -74,3 +78,8 @@ end
|
|
74
78
|
task :download_dir do
|
75
79
|
FileUtils.mkdir(DOWNLOAD_DIR) unless Dir.exists?(DOWNLOAD_DIR)
|
76
80
|
end
|
81
|
+
|
82
|
+
desc "Run all specs in spec directory"
|
83
|
+
RSpec::Core::RakeTask.new(:spec)
|
84
|
+
|
85
|
+
task default: :spec
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.1
|
1
|
+
0.2.0
|
data/lib/tika/client.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require "net/http"
|
1
2
|
require_relative "configuration"
|
2
3
|
require_relative "requests"
|
3
4
|
|
@@ -19,8 +20,8 @@ module Tika
|
|
19
20
|
attr_reader :host, :port
|
20
21
|
|
21
22
|
def initialize(opts={})
|
22
|
-
@host = opts.fetch(:host, config.host)
|
23
|
-
@port = opts.fetch(:port, config.port)
|
23
|
+
@host = opts.fetch(:host, self.class.config.host)
|
24
|
+
@port = opts.fetch(:port, self.class.config.port)
|
24
25
|
end
|
25
26
|
|
26
27
|
def get_text(opts={})
|
@@ -31,12 +32,28 @@ module Tika
|
|
31
32
|
GetMetadataRequest.execute(connection, opts)
|
32
33
|
end
|
33
34
|
|
34
|
-
|
35
|
+
def get_version
|
36
|
+
GetVersionRequest.execute(connection)
|
37
|
+
end
|
35
38
|
|
36
|
-
def
|
37
|
-
|
39
|
+
def get_mime_types
|
40
|
+
GetMimeTypesRequest.execute(connection)
|
38
41
|
end
|
39
42
|
|
43
|
+
def get_parsers
|
44
|
+
GetParsersRequest.execute(connection)
|
45
|
+
end
|
46
|
+
|
47
|
+
def get_parsers_details
|
48
|
+
GetParsersDetailsRequest.execute(connection)
|
49
|
+
end
|
50
|
+
|
51
|
+
def get_detectors
|
52
|
+
GetDetectorsRequest.execute(connection)
|
53
|
+
end
|
54
|
+
|
55
|
+
private
|
56
|
+
|
40
57
|
def connection
|
41
58
|
@connection ||= Net::HTTP.new(host, port)
|
42
59
|
end
|
data/lib/tika/request.rb
CHANGED
@@ -1,59 +1,79 @@
|
|
1
1
|
require "uri"
|
2
|
-
require "net/http"
|
3
2
|
require "delegate"
|
4
3
|
|
5
4
|
module Tika
|
6
5
|
class Request < SimpleDelegator
|
7
6
|
|
8
7
|
class << self
|
9
|
-
attr_accessor :
|
8
|
+
attr_accessor :http_method, :path
|
9
|
+
|
10
|
+
def headers
|
11
|
+
{}
|
12
|
+
end
|
10
13
|
end
|
11
14
|
|
12
|
-
attr_reader :connection
|
15
|
+
attr_reader :connection, :options
|
13
16
|
|
14
|
-
def self.execute(connection,
|
15
|
-
request = new(connection)
|
16
|
-
|
17
|
-
request.execute(opts)
|
17
|
+
def self.execute(connection, options={})
|
18
|
+
request = new(connection, options)
|
19
|
+
request.execute
|
18
20
|
end
|
19
21
|
|
20
|
-
def initialize(connection)
|
22
|
+
def initialize(connection, options={})
|
21
23
|
@connection = connection
|
24
|
+
@options = options
|
22
25
|
super build_request
|
23
|
-
|
26
|
+
handle_options
|
24
27
|
post_initialize
|
25
28
|
end
|
26
29
|
|
27
|
-
def execute
|
28
|
-
connection.start
|
29
|
-
|
30
|
-
self.body = file.read
|
31
|
-
self.content_length = file.size
|
32
|
-
end
|
33
|
-
self.content_type = opts[:content_type] if opts[:content_type]
|
34
|
-
yield self if block_given?
|
35
|
-
conn.request(__getobj__)
|
36
|
-
end
|
30
|
+
def execute
|
31
|
+
response = connection.start { |conn| conn.request(__getobj__) }
|
32
|
+
handle_response(response)
|
37
33
|
end
|
38
34
|
|
39
|
-
def
|
40
|
-
self.class.
|
35
|
+
def uri
|
36
|
+
@uri ||= URI::HTTP.build(host: connection.address, port: connection.port, path: self.class.path)
|
41
37
|
end
|
42
38
|
|
43
|
-
def
|
44
|
-
|
39
|
+
def handle_response(response)
|
40
|
+
response.body
|
45
41
|
end
|
46
42
|
|
47
43
|
private
|
48
44
|
|
49
|
-
def
|
45
|
+
def handle_options
|
46
|
+
add_file if file
|
47
|
+
set_content_type
|
48
|
+
add_headers
|
49
|
+
end
|
50
50
|
|
51
|
-
def
|
52
|
-
|
51
|
+
def set_content_type
|
52
|
+
self.content_type = options[:content_type] if options[:content_type]
|
53
|
+
end
|
54
|
+
|
55
|
+
def add_file
|
56
|
+
self.body = file.read
|
57
|
+
self.content_length = file.size
|
53
58
|
end
|
54
59
|
|
55
|
-
def
|
56
|
-
|
60
|
+
def file
|
61
|
+
options[:file]
|
62
|
+
end
|
63
|
+
|
64
|
+
def headers
|
65
|
+
@headers ||= self.class.headers.merge options.fetch(:headers, {})
|
66
|
+
end
|
67
|
+
|
68
|
+
def add_headers
|
69
|
+
headers.each { |header, value| self[header] = value }
|
70
|
+
end
|
71
|
+
|
72
|
+
# Subclass hook
|
73
|
+
def post_initialize; end
|
74
|
+
|
75
|
+
def build_request
|
76
|
+
self.class.http_method.new(uri)
|
57
77
|
end
|
58
78
|
|
59
79
|
end
|
data/lib/tika/requests.rb
CHANGED
@@ -1,19 +1,63 @@
|
|
1
|
+
require "json"
|
2
|
+
require "net/http"
|
1
3
|
require_relative "request"
|
2
|
-
require_relative "endpoints"
|
3
4
|
|
4
5
|
module Tika
|
5
6
|
module Requests
|
6
7
|
|
7
|
-
|
8
|
+
PUT = Net::HTTP::Put
|
9
|
+
GET = Net::HTTP::Get
|
8
10
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
11
|
+
class TextRequest < Request
|
12
|
+
def self.headers
|
13
|
+
{"Accept" => "text/plain"}
|
14
|
+
end
|
13
15
|
end
|
14
16
|
|
15
|
-
|
16
|
-
|
17
|
+
class JSONRequest < Request
|
18
|
+
def self.headers
|
19
|
+
{"Accept" => "application/json"}
|
20
|
+
end
|
21
|
+
|
22
|
+
def handle_response(response)
|
23
|
+
JSON.load(response.body)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
class GetTextRequest < TextRequest
|
28
|
+
self.http_method = PUT
|
29
|
+
self.path = "/tika"
|
30
|
+
end
|
31
|
+
|
32
|
+
class GetMetadataRequest < JSONRequest
|
33
|
+
self.http_method = PUT
|
34
|
+
self.path = "/meta"
|
35
|
+
end
|
36
|
+
|
37
|
+
class GetVersionRequest < Request
|
38
|
+
self.http_method = GET
|
39
|
+
self.path = "/version"
|
40
|
+
end
|
41
|
+
|
42
|
+
class GetMimeTypesRequest < JSONRequest
|
43
|
+
self.http_method = GET
|
44
|
+
self.path = "/mime-types"
|
45
|
+
end
|
46
|
+
|
47
|
+
class GetParsersRequest < JSONRequest
|
48
|
+
self.http_method = GET
|
49
|
+
self.path = "/parsers"
|
50
|
+
end
|
51
|
+
|
52
|
+
class GetParsersDetailsRequest < JSONRequest
|
53
|
+
self.http_method = GET
|
54
|
+
self.path = "/parsers/details"
|
55
|
+
end
|
56
|
+
|
57
|
+
class GetDetectorsRequest < JSONRequest
|
58
|
+
self.http_method = GET
|
59
|
+
self.path = "/detectors"
|
60
|
+
end
|
17
61
|
|
18
62
|
end
|
19
63
|
end
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/spec/spec_helper.rb
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
require "tika-client"
|
2
|
+
|
3
|
+
FIXTURE_DIR = File.expand_path("../fixtures", __FILE__)
|
4
|
+
|
1
5
|
# This file was generated by the `rspec --init` command. Conventionally, all
|
2
6
|
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
3
7
|
# The generated `.rspec` file contains `--require spec_helper` which will cause
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module Tika
|
2
|
+
RSpec.describe Client do
|
3
|
+
|
4
|
+
describe "#get_text" do
|
5
|
+
let(:file) { File.new(File.join(FIXTURE_DIR, "Lorem_ipsum.docx")) }
|
6
|
+
it "should return the text of the file" do
|
7
|
+
text = subject.get_text(file: file, content_type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
|
8
|
+
expect(text).to match(/^Lorem ipsum/)
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
describe "#get_metadata" do
|
13
|
+
let(:file) { File.new(File.join(FIXTURE_DIR, "Lorem_ipsum.pdf")) }
|
14
|
+
it "should return the metadata of the file" do
|
15
|
+
metadata = subject.get_metadata(file: file, content_type: "application/pdf")
|
16
|
+
expect(metadata["Creation-Date"]).to eq("2015-02-15T01:54:41Z")
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
describe "#get_version" do
|
21
|
+
it "should return the Tika server version" do
|
22
|
+
expect(subject.get_version).to match(/^Apache Tika/)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
describe "#get_mime_types" do
|
27
|
+
it "should return the MIME Types support by the Tika server" do
|
28
|
+
expect(subject.get_mime_types).to have_key("application/pdf")
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
describe "#get_parsers" do
|
33
|
+
it "should return the parsers available to the Tika server" do
|
34
|
+
expect(subject.get_parsers["name"]).to eq("org.apache.tika.parser.DefaultParser")
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
describe "#get_parsers_details" do
|
39
|
+
it "should return the parsers available to the Tika server and the MIME types they support" do
|
40
|
+
expect(subject.get_parsers_details["name"]).to eq("org.apache.tika.parser.DefaultParser")
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
describe "#get_detectors" do
|
45
|
+
it "should return the detectors available to the Tika server" do
|
46
|
+
expect(subject.get_detectors["name"]).to eq("org.apache.tika.detect.DefaultDetector")
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
end
|
51
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tika-client
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- dchandekstark
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-02-
|
11
|
+
date: 2015-02-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -69,10 +69,15 @@ files:
|
|
69
69
|
- lib/tika-client.rb
|
70
70
|
- lib/tika/client.rb
|
71
71
|
- lib/tika/configuration.rb
|
72
|
-
- lib/tika/endpoints.rb
|
73
72
|
- lib/tika/request.rb
|
74
73
|
- lib/tika/requests.rb
|
74
|
+
- spec/fixtures/Lorem_ipsum.docx
|
75
|
+
- spec/fixtures/Lorem_ipsum.pdf
|
76
|
+
- spec/fixtures/Lorem_ipsum.png
|
77
|
+
- spec/fixtures/Lorem_ipsum.tiff
|
75
78
|
- spec/spec_helper.rb
|
79
|
+
- spec/unit/client_spec.rb
|
80
|
+
- spec/unit/request_spec.rb
|
76
81
|
- tika-client.gemspec
|
77
82
|
homepage: https://github.com/duke-libraries/tika-client
|
78
83
|
licenses:
|
@@ -99,4 +104,10 @@ signing_key:
|
|
99
104
|
specification_version: 4
|
100
105
|
summary: Ruby bindings for Apache Tika Server REST API
|
101
106
|
test_files:
|
107
|
+
- spec/fixtures/Lorem_ipsum.docx
|
108
|
+
- spec/fixtures/Lorem_ipsum.pdf
|
109
|
+
- spec/fixtures/Lorem_ipsum.png
|
110
|
+
- spec/fixtures/Lorem_ipsum.tiff
|
102
111
|
- spec/spec_helper.rb
|
112
|
+
- spec/unit/client_spec.rb
|
113
|
+
- spec/unit/request_spec.rb
|
data/lib/tika/endpoints.rb
DELETED
@@ -1,18 +0,0 @@
|
|
1
|
-
require "net/http"
|
2
|
-
|
3
|
-
module Tika
|
4
|
-
module Endpoints
|
5
|
-
|
6
|
-
PUT = Net::HTTP::Put
|
7
|
-
GET = Net::HTTP::Get
|
8
|
-
|
9
|
-
JSON = "application/json"
|
10
|
-
TEXT = "text/plain"
|
11
|
-
|
12
|
-
Endpoint = Struct.new(:request_method, :path, :response_format)
|
13
|
-
|
14
|
-
GetTextEndpoint = Endpoint.new(PUT, "/tika", TEXT)
|
15
|
-
GetMetadataEndpoint = Endpoint.new(PUT, "/meta", JSON)
|
16
|
-
|
17
|
-
end
|
18
|
-
end
|