tika-client 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3bb963d542e83e8d690b3e243bf05a155ec22391
4
- data.tar.gz: 7776e59b7a1210326b8006f3891dfb78f14cf9c8
3
+ metadata.gz: 2657a16d46c66487c7e90bb41b17bc18b178e972
4
+ data.tar.gz: 9386b75938dd03c8622c7c8efb2dd641b3627f52
5
5
  SHA512:
6
- metadata.gz: 09198bec1ef7e2f79d0bee0b2b7762dc438ef3e5f821f52c000f297526a82d6e145d46a8e5b6ad17c9244f1dda5a978e96c9cb0bdc0aacf90269438165f16a5a
7
- data.tar.gz: e058fd18fb370a579d21a58a1c227313d81000bc5a6e27344fb42a0c6a14c6a89644070fc435a5f6a68d1432f42db2ae77ee84848565c981fadaf98f16a08e2e
6
+ metadata.gz: 6a205ff0d6422c6098c904f1ed2fa458367de5b991ca8197b54f6fbf64ea093ba5c3a7f3cd7ee85e3d34c9b042dca3df40ee94c4cbd327e0a8da502c7e058f37
7
+ data.tar.gz: 507f8ebc7ed4aeb4d6171450f7cf8b96b434dc903cc560dbfe3f8d49969474b81826a02ce7bcbf8fbb4ef1a6dac6d7a4bfb64a8942c62914824a781fb3f5e0cb
data/Rakefile CHANGED
@@ -1,4 +1,5 @@
1
1
  require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
2
3
  require "openssl"
3
4
  require "net/http"
4
5
 
@@ -36,7 +37,10 @@ namespace :tika do
36
37
 
37
38
  desc "Start Tika server"
38
39
  task :start do
39
- if File.exists?(tika_path)
40
+ if File.exists?(PID_FILE)
41
+ pid = File.read(PID_FILE).strip
42
+ puts "Tika server is already running (PID #{pid})"
43
+ elsif File.exists?(tika_path)
40
44
  puts "Starting Tika server ..."
41
45
  File.open(PID_FILE, "w") do |pid_file|
42
46
  pid = fork { exec "java -jar #{tika_path}" }
@@ -74,3 +78,8 @@ end
74
78
  task :download_dir do
75
79
  FileUtils.mkdir(DOWNLOAD_DIR) unless Dir.exists?(DOWNLOAD_DIR)
76
80
  end
81
+
82
+ desc "Run all specs in spec directory"
83
+ RSpec::Core::RakeTask.new(:spec)
84
+
85
+ task default: :spec
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.1
1
+ 0.2.0
data/lib/tika/client.rb CHANGED
@@ -1,3 +1,4 @@
1
+ require "net/http"
1
2
  require_relative "configuration"
2
3
  require_relative "requests"
3
4
 
@@ -19,8 +20,8 @@ module Tika
19
20
  attr_reader :host, :port
20
21
 
21
22
  def initialize(opts={})
22
- @host = opts.fetch(:host, config.host)
23
- @port = opts.fetch(:port, config.port)
23
+ @host = opts.fetch(:host, self.class.config.host)
24
+ @port = opts.fetch(:port, self.class.config.port)
24
25
  end
25
26
 
26
27
  def get_text(opts={})
@@ -31,12 +32,28 @@ module Tika
31
32
  GetMetadataRequest.execute(connection, opts)
32
33
  end
33
34
 
34
- private
35
+ def get_version
36
+ GetVersionRequest.execute(connection)
37
+ end
35
38
 
36
- def config
37
- self.class.config
39
+ def get_mime_types
40
+ GetMimeTypesRequest.execute(connection)
38
41
  end
39
42
 
43
+ def get_parsers
44
+ GetParsersRequest.execute(connection)
45
+ end
46
+
47
+ def get_parsers_details
48
+ GetParsersDetailsRequest.execute(connection)
49
+ end
50
+
51
+ def get_detectors
52
+ GetDetectorsRequest.execute(connection)
53
+ end
54
+
55
+ private
56
+
40
57
  def connection
41
58
  @connection ||= Net::HTTP.new(host, port)
42
59
  end
data/lib/tika/request.rb CHANGED
@@ -1,59 +1,79 @@
1
1
  require "uri"
2
- require "net/http"
3
2
  require "delegate"
4
3
 
5
4
  module Tika
6
5
  class Request < SimpleDelegator
7
6
 
8
7
  class << self
9
- attr_accessor :endpoint
8
+ attr_accessor :http_method, :path
9
+
10
+ def headers
11
+ {}
12
+ end
10
13
  end
11
14
 
12
- attr_reader :connection
15
+ attr_reader :connection, :options
13
16
 
14
- def self.execute(connection, opts={})
15
- request = new(connection)
16
- yield request if block_given?
17
- request.execute(opts)
17
+ def self.execute(connection, options={})
18
+ request = new(connection, options)
19
+ request.execute
18
20
  end
19
21
 
20
- def initialize(connection)
22
+ def initialize(connection, options={})
21
23
  @connection = connection
24
+ @options = options
22
25
  super build_request
23
- set_defaults
26
+ handle_options
24
27
  post_initialize
25
28
  end
26
29
 
27
- def execute(opts={})
28
- connection.start do |conn|
29
- if file = opts.delete(:file)
30
- self.body = file.read
31
- self.content_length = file.size
32
- end
33
- self.content_type = opts[:content_type] if opts[:content_type]
34
- yield self if block_given?
35
- conn.request(__getobj__)
36
- end
30
+ def execute
31
+ response = connection.start { |conn| conn.request(__getobj__) }
32
+ handle_response(response)
37
33
  end
38
34
 
39
- def endpoint
40
- self.class.endpoint
35
+ def uri
36
+ @uri ||= URI::HTTP.build(host: connection.address, port: connection.port, path: self.class.path)
41
37
  end
42
38
 
43
- def uri
44
- @uri ||= URI::HTTP.build(host: connection.address, port: connection.port, path: endpoint.path)
39
+ def handle_response(response)
40
+ response.body
45
41
  end
46
42
 
47
43
  private
48
44
 
49
- def post_initialize; end
45
+ def handle_options
46
+ add_file if file
47
+ set_content_type
48
+ add_headers
49
+ end
50
50
 
51
- def build_request
52
- endpoint.request_method.new(uri)
51
+ def set_content_type
52
+ self.content_type = options[:content_type] if options[:content_type]
53
+ end
54
+
55
+ def add_file
56
+ self.body = file.read
57
+ self.content_length = file.size
53
58
  end
54
59
 
55
- def set_defaults
56
- self["Accept"] = endpoint.response_format
60
+ def file
61
+ options[:file]
62
+ end
63
+
64
+ def headers
65
+ @headers ||= self.class.headers.merge options.fetch(:headers, {})
66
+ end
67
+
68
+ def add_headers
69
+ headers.each { |header, value| self[header] = value }
70
+ end
71
+
72
+ # Subclass hook
73
+ def post_initialize; end
74
+
75
+ def build_request
76
+ self.class.http_method.new(uri)
57
77
  end
58
78
 
59
79
  end
data/lib/tika/requests.rb CHANGED
@@ -1,19 +1,63 @@
1
+ require "json"
2
+ require "net/http"
1
3
  require_relative "request"
2
- require_relative "endpoints"
3
4
 
4
5
  module Tika
5
6
  module Requests
6
7
 
7
- include Endpoints
8
+ PUT = Net::HTTP::Put
9
+ GET = Net::HTTP::Get
8
10
 
9
- def self.request_class(endpoint)
10
- klass = Class.new(Request)
11
- klass.endpoint = endpoint
12
- klass
11
+ class TextRequest < Request
12
+ def self.headers
13
+ {"Accept" => "text/plain"}
14
+ end
13
15
  end
14
16
 
15
- GetTextRequest = request_class GetTextEndpoint
16
- GetMetadataRequest = request_class GetMetadataEndpoint
17
+ class JSONRequest < Request
18
+ def self.headers
19
+ {"Accept" => "application/json"}
20
+ end
21
+
22
+ def handle_response(response)
23
+ JSON.load(response.body)
24
+ end
25
+ end
26
+
27
+ class GetTextRequest < TextRequest
28
+ self.http_method = PUT
29
+ self.path = "/tika"
30
+ end
31
+
32
+ class GetMetadataRequest < JSONRequest
33
+ self.http_method = PUT
34
+ self.path = "/meta"
35
+ end
36
+
37
+ class GetVersionRequest < Request
38
+ self.http_method = GET
39
+ self.path = "/version"
40
+ end
41
+
42
+ class GetMimeTypesRequest < JSONRequest
43
+ self.http_method = GET
44
+ self.path = "/mime-types"
45
+ end
46
+
47
+ class GetParsersRequest < JSONRequest
48
+ self.http_method = GET
49
+ self.path = "/parsers"
50
+ end
51
+
52
+ class GetParsersDetailsRequest < JSONRequest
53
+ self.http_method = GET
54
+ self.path = "/parsers/details"
55
+ end
56
+
57
+ class GetDetectorsRequest < JSONRequest
58
+ self.http_method = GET
59
+ self.path = "/detectors"
60
+ end
17
61
 
18
62
  end
19
63
  end
data/spec/spec_helper.rb CHANGED
@@ -1,3 +1,7 @@
1
+ require "tika-client"
2
+
3
+ FIXTURE_DIR = File.expand_path("../fixtures", __FILE__)
4
+
1
5
  # This file was generated by the `rspec --init` command. Conventionally, all
2
6
  # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
3
7
  # The generated `.rspec` file contains `--require spec_helper` which will cause
data/spec/unit/client_spec.rb ADDED
@@ -0,0 +1,51 @@
1
+ module Tika
2
+ RSpec.describe Client do
3
+
4
+ describe "#get_text" do
5
+ let(:file) { File.new(File.join(FIXTURE_DIR, "Lorem_ipsum.docx")) }
6
+ it "should return the text of the file" do
7
+ text = subject.get_text(file: file, content_type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
8
+ expect(text).to match(/^Lorem ipsum/)
9
+ end
10
+ end
11
+
12
+ describe "#get_metadata" do
13
+ let(:file) { File.new(File.join(FIXTURE_DIR, "Lorem_ipsum.pdf")) }
14
+ it "should return the metadata of the file" do
15
+ metadata = subject.get_metadata(file: file, content_type: "application/pdf")
16
+ expect(metadata["Creation-Date"]).to eq("2015-02-15T01:54:41Z")
17
+ end
18
+ end
19
+
20
+ describe "#get_version" do
21
+ it "should return the Tika server version" do
22
+ expect(subject.get_version).to match(/^Apache Tika/)
23
+ end
24
+ end
25
+
26
+ describe "#get_mime_types" do
27
+ it "should return the MIME Types support by the Tika server" do
28
+ expect(subject.get_mime_types).to have_key("application/pdf")
29
+ end
30
+ end
31
+
32
+ describe "#get_parsers" do
33
+ it "should return the parsers available to the Tika server" do
34
+ expect(subject.get_parsers["name"]).to eq("org.apache.tika.parser.DefaultParser")
35
+ end
36
+ end
37
+
38
+ describe "#get_parsers_details" do
39
+ it "should return the parsers available to the Tika server and the MIME types they support" do
40
+ expect(subject.get_parsers_details["name"]).to eq("org.apache.tika.parser.DefaultParser")
41
+ end
42
+ end
43
+
44
+ describe "#get_detectors" do
45
+ it "should return the detectors available to the Tika server" do
46
+ expect(subject.get_detectors["name"]).to eq("org.apache.tika.detect.DefaultDetector")
47
+ end
48
+ end
49
+
50
+ end
51
+ end
data/spec/unit/request_spec.rb ADDED
@@ -0,0 +1,10 @@
1
+ module Tika
2
+ RSpec.describe Request do
3
+
4
+ subject { described_class.new(connection, options) }
5
+ let(:connection) { Net::HTTP.new("localhost", 9998) }
6
+
7
+
8
+
9
+ end
10
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tika-client
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - dchandekstark
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-02-14 00:00:00.000000000 Z
11
+ date: 2015-02-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -69,10 +69,15 @@ files:
69
69
  - lib/tika-client.rb
70
70
  - lib/tika/client.rb
71
71
  - lib/tika/configuration.rb
72
- - lib/tika/endpoints.rb
73
72
  - lib/tika/request.rb
74
73
  - lib/tika/requests.rb
74
+ - spec/fixtures/Lorem_ipsum.docx
75
+ - spec/fixtures/Lorem_ipsum.pdf
76
+ - spec/fixtures/Lorem_ipsum.png
77
+ - spec/fixtures/Lorem_ipsum.tiff
75
78
  - spec/spec_helper.rb
79
+ - spec/unit/client_spec.rb
80
+ - spec/unit/request_spec.rb
76
81
  - tika-client.gemspec
77
82
  homepage: https://github.com/duke-libraries/tika-client
78
83
  licenses:
@@ -99,4 +104,10 @@ signing_key:
99
104
  specification_version: 4
100
105
  summary: Ruby bindings for Apache Tika Server REST API
101
106
  test_files:
107
+ - spec/fixtures/Lorem_ipsum.docx
108
+ - spec/fixtures/Lorem_ipsum.pdf
109
+ - spec/fixtures/Lorem_ipsum.png
110
+ - spec/fixtures/Lorem_ipsum.tiff
102
111
  - spec/spec_helper.rb
112
+ - spec/unit/client_spec.rb
113
+ - spec/unit/request_spec.rb
data/lib/tika/endpoints.rb DELETED
@@ -1,18 +0,0 @@
1
- require "net/http"
2
-
3
- module Tika
4
- module Endpoints
5
-
6
- PUT = Net::HTTP::Put
7
- GET = Net::HTTP::Get
8
-
9
- JSON = "application/json"
10
- TEXT = "text/plain"
11
-
12
- Endpoint = Struct.new(:request_method, :path, :response_format)
13
-
14
- GetTextEndpoint = Endpoint.new(PUT, "/tika", TEXT)
15
- GetMetadataEndpoint = Endpoint.new(PUT, "/meta", JSON)
16
-
17
- end
18
- end