tikas 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/.rvmrc +1 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/Rakefile +12 -0
- data/Readme.md +80 -0
- data/config.ru +8 -0
- data/lib/tika-app.jar +0 -0
- data/lib/tikas.rb +70 -0
- data/lib/tikas/version.rb +3 -0
- data/test/test_files/sample.pdf +0 -0
- data/test/test_helper.rb +9 -0
- data/test/tikas_test.rb +8 -0
- data/tikas.gemspec +26 -0
- metadata +106 -0
data/.gitignore
ADDED
data/.rvmrc
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rvm gemset use tikas
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2011 Julio Arias
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person
|
4
|
+
obtaining a copy of this software and associated documentation
|
5
|
+
files (the "Software"), to deal in the Software without
|
6
|
+
restriction, including without limitation the rights to use,
|
7
|
+
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
8
|
+
copies of the Software, and to permit persons to whom the
|
9
|
+
Software is furnished to do so, subject to the following
|
10
|
+
conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be
|
13
|
+
included in all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
17
|
+
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
19
|
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
20
|
+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
21
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
22
|
+
OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
ADDED
data/Readme.md
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
# What is this?
|
2
|
+
|
3
|
+
A Sinatra service around the Apache Tika content extraction project.
|
4
|
+
|
5
|
+
# Requisites
|
6
|
+
|
7
|
+
TikaS requires nailgun to be installed
|
8
|
+
|
9
|
+
On OS X (using Homebrew, or follow the Linux instructions):
|
10
|
+
|
11
|
+
$ brew install nailgun
|
12
|
+
|
13
|
+
On Linux:
|
14
|
+
|
15
|
+
1- Download [Nailgun](http://sourceforge.net/projects/nailgun/files/nailgun/0.7.1/)
|
16
|
+
|
17
|
+
2- Build the ng client
|
18
|
+
|
19
|
+
$ cd nailgun_folder ; make
|
20
|
+
|
21
|
+
3- Add ng to your PATH
|
22
|
+
|
23
|
+
4- Create an ng-server script and put it in your PATH
|
24
|
+
|
25
|
+
#!/bin/bash
|
26
|
+
exec java -server -jar "/path/to/nailgun/jar/nailgun-0.7.1.jar"
|
27
|
+
|
28
|
+
# Usage:
|
29
|
+
|
30
|
+
$ gem install tikas
|
31
|
+
|
32
|
+
or
|
33
|
+
|
34
|
+
$ bundle install # if using bundler
|
35
|
+
|
36
|
+
In your config.ru
|
37
|
+
|
38
|
+
require 'rubygems' # require bundler too if you're using it
|
39
|
+
require 'tikas'
|
40
|
+
|
41
|
+
run TikaS::TikaServer
|
42
|
+
|
43
|
+
Call the service
|
44
|
+
|
45
|
+
* To get the file text content:
|
46
|
+
|
47
|
+
$ curl -v -F "data=@your_file.ext" http://localhost:9292/extract
|
48
|
+
|
49
|
+
* To get the file metadata only
|
50
|
+
|
51
|
+
$ curl -v -F "data=@your_file.ext" http://localhost:9292/metadata
|
52
|
+
|
53
|
+
# TODO:
|
54
|
+
|
55
|
+
* Tests, tests, and more tests :D
|
56
|
+
|
57
|
+
# LICENSE:
|
58
|
+
|
59
|
+
Copyright (c) 2011 Julio Arias
|
60
|
+
|
61
|
+
Permission is hereby granted, free of charge, to any person
|
62
|
+
obtaining a copy of this software and associated documentation
|
63
|
+
files (the "Software"), to deal in the Software without
|
64
|
+
restriction, including without limitation the rights to use,
|
65
|
+
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
66
|
+
copies of the Software, and to permit persons to whom the
|
67
|
+
Software is furnished to do so, subject to the following
|
68
|
+
conditions:
|
69
|
+
|
70
|
+
The above copyright notice and this permission notice shall be
|
71
|
+
included in all copies or substantial portions of the Software.
|
72
|
+
|
73
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
74
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
75
|
+
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
76
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
77
|
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
78
|
+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
79
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
80
|
+
OTHER DEALINGS IN THE SOFTWARE.
|
data/config.ru
ADDED
data/lib/tika-app.jar
ADDED
Binary file
|
data/lib/tikas.rb
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'tikas/version'
|
2
|
+
require 'sinatra'
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
module TikaS
  # Sinatra application that exposes Apache Tika content extraction over HTTP.
  #
  # Tika is run through the Nailgun client (`ng`): when the class is
  # configured it launches `ng-server`, adds the bundled tika-app.jar to the
  # Nailgun classpath and registers a `tika` alias for TikaCLI. An `at_exit`
  # hook asks Nailgun to stop when the Ruby process ends.
  #
  # Routes:
  #   GET  /          - HTML welcome page with sample curl commands
  #   POST /extract   - JSON {data, api_version} with the uploaded file's text
  #   POST /metadata  - JSON {metadata, api_version} for the uploaded file
  #
  # Both POST routes read the multipart upload from the `data` parameter and
  # answer 500 with {error, api_version} when anything raises.
  class TikaServer < Sinatra::Base
    TIKA_JAR = File.join(File.dirname(__FILE__), "tika-app.jar")

    configure do
      # Start nailgun; assumes it was installed with Homebrew or that a
      # similar ng-server script exists in PATH.
      `nohup ng-server > /dev/null 2>&1 &`

      sleep 0.5 # Just to let nailgun start properly

      # Add tika-app.jar to nailgun classpath
      `ng ng-cp #{TIKA_JAR}`
      # Create an alias to TikaCLI class
      `ng ng-alias tika org.apache.tika.cli.TikaCLI`
    end

    get '/' do
      <<-EOF
        <p>
          Welcome to TikaS a simple server for extracting document content and metadata.
        </p>
        <p>
          TikaS uses Apache Tika to do the extraction so any file type supported by Tika is supported by TikaS
        </p>
        <p>
          Sample usage:
          <br/>
          <br/>
          $ curl -v -F "data=@your_file.ext" #{request.url}extract
          <br/>
          $ curl -v -F "data=@your_file.ext" #{request.url}metadata
          <br/>
        </p>
        <p>
          More info <a href="https://github.com/jarias/tikas">TikaS</a>
        </p>
      EOF
    end

    post '/extract' do
      content_type 'application/json'
      begin
        {:data => `ng tika -t #{params[:data][:tempfile].path}`, :api_version => VERSION}.to_json
      rescue StandardError => e
        # StandardError only: signals and SystemExit must keep propagating.
        error_response(e)
      end
    end

    post '/metadata' do
      content_type 'application/json'
      begin
        metadata = parse_metadata(`ng tika -t -m #{params[:data][:tempfile].path}`)
        # Report the client's file name, overriding any resourceName parsed
        # from Tika's output for the temp file it was given.
        metadata["resourceName"] = params[:data][:filename]
        {:metadata => metadata, :api_version => VERSION}.to_json
      rescue StandardError => e
        # StandardError only: signals and SystemExit must keep propagating.
        error_response(e)
      end
    end

    at_exit do
      `ng ng-stop`
    end

    private

    # Turns Tika's "key: value" text output into a Hash.
    #
    # Each line is split on the FIRST colon only, so values that themselves
    # contain colons (timestamps, URLs) are kept whole. Lines without a key
    # (blank lines, or lines starting with a colon) are skipped. A line with
    # no colon maps its whole text to nil.
    #
    # @param raw [String] newline-separated output of `ng tika -t -m`
    # @return [Hash{String => String, nil}]
    def parse_metadata(raw)
      raw.split("\n").each_with_object({}) do |line, parsed|
        key, value = line.split(':', 2)
        next if key.nil? || key.empty?

        parsed[key] = value
      end
    end

    # Builds the Sinatra return value for a failed request.
    #
    # @param error [StandardError] the exception raised while handling the request
    # @return [Array(Integer, String)] 500 status and a JSON error body
    def error_response(error)
      [500, {:error => error.to_s, :api_version => VERSION}.to_json]
    end
  end
end
|
Binary file
|
data/test/test_helper.rb
ADDED
data/test/tikas_test.rb
ADDED
data/tikas.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "tikas/version"
|
4
|
+
|
5
|
+
# Gem specification for tikas. The packaged file lists are taken from
# `git ls-files`, so the gem must be built from inside a git checkout.
Gem::Specification.new do |spec|
  spec.name        = "tikas"
  spec.version     = TikaS::VERSION
  spec.authors     = ["Julio Arias"]
  spec.email       = ["jarias01@gmail.com"]
  spec.homepage    = "https://github.com/jarias/tikas"
  spec.summary     = "Sinatra based service around Apache Tika (http://tika.apache.org/)"
  spec.description = "Sinatra based service around Apache Tika content extraction project"

  spec.rubyforge_project = "tikas"

  # Everything tracked by git ships with the gem.
  tracked_files = `git ls-files`
  spec.files = tracked_files.split("\n")

  tracked_tests = `git ls-files -- {test,spec,features}/*`
  spec.test_files = tracked_tests.split("\n")

  # Executables are listed by bare file name, without the bin/ prefix.
  tracked_bins = `git ls-files -- bin/*`
  spec.executables = tracked_bins.split("\n").map { |path| File.basename(path) }

  spec.require_paths = ["lib"]

  # Runtime dependency.
  spec.add_dependency "sinatra", "~> 1.2.6"

  # Development-only dependencies, declared in their original order.
  [
    ["guard",      "~> 0.3.4"],
    ["rb-fsevent", "~> 0.4.0"],
    ["rack-test",  "~> 0.6.0"]
  ].each do |gem_name, constraint|
    spec.add_development_dependency gem_name, constraint
  end
end
|
metadata
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: tikas
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Julio Arias
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-11-13 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: sinatra
|
16
|
+
requirement: &70143955583440 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 1.2.6
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70143955583440
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: guard
|
27
|
+
requirement: &70143955582300 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ~>
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 0.3.4
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70143955582300
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: rb-fsevent
|
38
|
+
requirement: &70143955581520 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ~>
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 0.4.0
|
44
|
+
type: :development
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *70143955581520
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: rack-test
|
49
|
+
requirement: &70143955580580 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ~>
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 0.6.0
|
55
|
+
type: :development
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *70143955580580
|
58
|
+
description: Sinatra based service around Apache Tika content extraction project
|
59
|
+
email:
|
60
|
+
- jarias01@gmail.com
|
61
|
+
executables: []
|
62
|
+
extensions: []
|
63
|
+
extra_rdoc_files: []
|
64
|
+
files:
|
65
|
+
- .gitignore
|
66
|
+
- .rvmrc
|
67
|
+
- Gemfile
|
68
|
+
- LICENSE
|
69
|
+
- Rakefile
|
70
|
+
- Readme.md
|
71
|
+
- config.ru
|
72
|
+
- lib/tika-app.jar
|
73
|
+
- lib/tikas.rb
|
74
|
+
- lib/tikas/version.rb
|
75
|
+
- test/test_files/sample.pdf
|
76
|
+
- test/test_helper.rb
|
77
|
+
- test/tikas_test.rb
|
78
|
+
- tikas.gemspec
|
79
|
+
homepage: https://github.com/jarias/tikas
|
80
|
+
licenses: []
|
81
|
+
post_install_message:
|
82
|
+
rdoc_options: []
|
83
|
+
require_paths:
|
84
|
+
- lib
|
85
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
86
|
+
none: false
|
87
|
+
requirements:
|
88
|
+
- - ! '>='
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
version: '0'
|
91
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
92
|
+
none: false
|
93
|
+
requirements:
|
94
|
+
- - ! '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
requirements: []
|
98
|
+
rubyforge_project: tikas
|
99
|
+
rubygems_version: 1.8.10
|
100
|
+
signing_key:
|
101
|
+
specification_version: 3
|
102
|
+
summary: Sinatra based service around Apache Tika (http://tika.apache.org/)
|
103
|
+
test_files:
|
104
|
+
- test/test_files/sample.pdf
|
105
|
+
- test/test_helper.rb
|
106
|
+
- test/tikas_test.rb
|