tikas 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm gemset use tikas
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "https://rubygems.org" # explicit HTTPS URL; the :rubygems shorthand is deprecated and resolves to plain HTTP
2
+
3
+ # Specify your gem's dependencies in tikas.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2011 Julio Arias
2
+
3
+ Permission is hereby granted, free of charge, to any person
4
+ obtaining a copy of this software and associated documentation
5
+ files (the "Software"), to deal in the Software without
6
+ restriction, including without limitation the rights to use,
7
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the
9
+ Software is furnished to do so, subject to the following
10
+ conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22
+ OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,12 @@
1
+ require 'bundler/gem_tasks'
2
+
3
+ require 'rake/testtask'
4
+
5
+ task :default => [:test_units]
6
+
7
+ desc "Run tests"
8
+ Rake::TestTask.new do |t|
9
+ t.libs << "test"
10
+ t.test_files = FileList['test/*test.rb']
11
+ t.verbose = true
12
+ end
@@ -0,0 +1,80 @@
1
+ # What is this?
2
+
3
+ A Sinatra service around the Apache Tika content extraction project.
4
+
5
+ # Requisites
6
+
7
+ TikaS requires nailgun to be installed
8
+
9
+ On OS X (using Homebrew; otherwise follow the Linux instructions):
10
+
11
+ $ brew install nailgun
12
+
13
+ On Linux:
14
+
15
+ 1- Download [Nailgun](http://sourceforge.net/projects/nailgun/files/nailgun/0.7.1/)
16
+
17
+ 2- Build the ng client
18
+
19
+ $ cd nailgun_folder ; make
20
+
21
+ 3- Add ng to your PATH
22
+
23
+ 4- Create an ng-server script and put it in your PATH
24
+
25
+ #!/bin/bash
26
+ exec java -server -jar "/path/to/nailgun/jar/nailgun-0.7.1.jar"
27
+
28
+ # Usage:
29
+
30
+ $ gem install tikas
31
+
32
+ or
33
+
34
+ $ bundle install # if using bundler
35
+
36
+ In your config.ru
37
+
38
+ require 'rubygems' # require bundler too if you're using it
39
+ require 'tikas'
40
+
41
+ run TikaS::TikaServer
42
+
43
+ Call the service
44
+
45
+ * To get the file text content:
46
+
47
+ $ curl -v -F "data=@your_file.ext" http://localhost:9292/extract
48
+
49
+ * To get the file metadata only
50
+
51
+ $ curl -v -F "data=@your_file.ext" http://localhost:9292/metadata
52
+
53
+ # TODO:
54
+
55
+ * Test tests and more tests :D
56
+
57
+ # LICENSE:
58
+
59
+ Copyright (c) 2011 Julio Arias
60
+
61
+ Permission is hereby granted, free of charge, to any person
62
+ obtaining a copy of this software and associated documentation
63
+ files (the "Software"), to deal in the Software without
64
+ restriction, including without limitation the rights to use,
65
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
66
+ copies of the Software, and to permit persons to whom the
67
+ Software is furnished to do so, subject to the following
68
+ conditions:
69
+
70
+ The above copyright notice and this permission notice shall be
71
+ included in all copies or substantial portions of the Software.
72
+
73
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
74
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
75
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
76
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
77
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
78
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
79
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
80
+ OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,8 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+ # Put this file's directory on the load path so that 'lib/tikas' resolves.
4
+ $:.unshift File.expand_path(File.dirname(__FILE__))
5
+
6
+ require 'lib/tikas'
7
+ # Rack entry point: serve the TikaS Sinatra application.
8
+ run TikaS::TikaServer
Binary file
@@ -0,0 +1,70 @@
1
+ require 'tikas/version'
2
+ require 'sinatra'
3
+ require 'json'
4
+
5
+ module TikaS
6
+ class TikaServer < Sinatra::Base
7
+ TIKA_JAR = File.join(File.dirname(__FILE__), "tika-app.jar")
8
+
9
+ configure do
10
+ # Start nailgun assuming is installed with hombrew or similar ng-server script exists in PATH
11
+ `nohup ng-server > /dev/null 2>&1 &`
12
+
13
+ sleep 0.5 # Just to let nailgun start properly
14
+
15
+ # Add tika-app.jar to nailgun classpath
16
+ `ng ng-cp #{TIKA_JAR}`
17
+ # Create an alias to TikaCLI class
18
+ `ng ng-alias tika org.apache.tika.cli.TikaCLI`
19
+ end
20
+
21
+ get '/' do
22
+ <<-EOF
23
+ <p>
24
+ Welcome to TikaS a simple server for extracting document content and metadata.
25
+ </p>
26
+ <p>
27
+ TikaS uses Apache Tika to do the extraction so any file type sopported by Tika is supported by TikaS
28
+ </p>
29
+ <p>
30
+ Sample usage:
31
+ <br/>
32
+ <br/>
33
+ $ curl -v -F "data=@your_file.ext" #{request.url}extract
34
+ <br/>
35
+ $ curl -v -F "data=@your_file.ext" #{request.url}metadata
36
+ <br/>
37
+ </p>
38
+ <p>
39
+ More info <a href="https://github.com/jarias/tikas">TikaS</a>
40
+ </p>
41
+ EOF
42
+ end
43
+
44
+ post '/extract' do
45
+ content_type 'application/json'
46
+ begin
47
+ {:data => `ng tika -t #{params[:data][:tempfile].path}`, :api_version => VERSION}.to_json
48
+ rescue Exception => e
49
+ [500, {:error => e.to_s, :api_version => VERSION}.to_json]
50
+ end
51
+ end
52
+
53
+ post '/metadata' do
54
+ content_type 'application/json'
55
+ begin
56
+ metadata = {}
57
+ m = `ng tika -t -m #{params[:data][:tempfile].path}`
58
+ m.split("\n").map {|l| l.split ':'}.each {|e| metadata[e[0]] = e[1]}
59
+ metadata["resourceName"] = params[:data][:filename]
60
+ {:metadata => metadata, :api_version => VERSION}.to_json
61
+ rescue Exception => e
62
+ [500, {:error => e.to_s, :api_version => VERSION}.to_json]
63
+ end
64
+ end
65
+
66
+ at_exit do
67
+ `ng ng-stop`
68
+ end
69
+ end
70
+ end
@@ -0,0 +1,3 @@
1
+ module TikaS
2
+ VERSION = "0.0.2" # gem version; also sent as :api_version in the server's JSON responses
3
+ end
@@ -0,0 +1,9 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+
4
+ require 'test/unit'
5
+ require 'rack/test'
6
+
7
+ $:.push File.expand_path("../..", __FILE__)
8
+
9
+ require 'lib/tikas'
@@ -0,0 +1,8 @@
1
+ require 'test_helper'
2
+
3
+ class TikasTest < Test::Unit::TestCase
4
+ # TODO: replace this placeholder with real tests of the server endpoints
5
+ def test_test
6
+ assert true
7
+ end
8
+ end
@@ -0,0 +1,26 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "tikas/version"
4
+
5
+ Gem::Specification.new do |s|
6
+ s.name = "tikas"
7
+ s.version = TikaS::VERSION
8
+ s.authors = ["Julio Arias"]
9
+ s.email = ["jarias01@gmail.com"]
10
+ s.homepage = "https://github.com/jarias/tikas"
11
+ s.summary = %q{Sinatra based service around Apache Tika (http://tika.apache.org/)}
12
+ s.description = %q{Sinatra based service around Apache Tika content extraction project}
13
+
14
+ s.rubyforge_project = "tikas"
15
+ # File lists are read from `git ls-files`, so only git-tracked files are packaged.
16
+ s.files = `git ls-files`.split("\n")
17
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
18
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
19
+ s.require_paths = ["lib"]
20
+ # Runtime dependency.
21
+ s.add_dependency "sinatra", "~> 1.2.6"
22
+ # Development-only dependencies.
23
+ s.add_development_dependency "guard", "~> 0.3.4"
24
+ s.add_development_dependency "rb-fsevent", "~> 0.4.0"
25
+ s.add_development_dependency "rack-test", "~> 0.6.0"
26
+ end
metadata ADDED
@@ -0,0 +1,106 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tikas
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Julio Arias
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-11-13 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: sinatra
16
+ requirement: &70143955583440 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: 1.2.6
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *70143955583440
25
+ - !ruby/object:Gem::Dependency
26
+ name: guard
27
+ requirement: &70143955582300 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ~>
31
+ - !ruby/object:Gem::Version
32
+ version: 0.3.4
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *70143955582300
36
+ - !ruby/object:Gem::Dependency
37
+ name: rb-fsevent
38
+ requirement: &70143955581520 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ~>
42
+ - !ruby/object:Gem::Version
43
+ version: 0.4.0
44
+ type: :development
45
+ prerelease: false
46
+ version_requirements: *70143955581520
47
+ - !ruby/object:Gem::Dependency
48
+ name: rack-test
49
+ requirement: &70143955580580 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: 0.6.0
55
+ type: :development
56
+ prerelease: false
57
+ version_requirements: *70143955580580
58
+ description: Sinatra based service around Apache Tika content extraction project
59
+ email:
60
+ - jarias01@gmail.com
61
+ executables: []
62
+ extensions: []
63
+ extra_rdoc_files: []
64
+ files:
65
+ - .gitignore
66
+ - .rvmrc
67
+ - Gemfile
68
+ - LICENSE
69
+ - Rakefile
70
+ - Readme.md
71
+ - config.ru
72
+ - lib/tika-app.jar
73
+ - lib/tikas.rb
74
+ - lib/tikas/version.rb
75
+ - test/test_files/sample.pdf
76
+ - test/test_helper.rb
77
+ - test/tikas_test.rb
78
+ - tikas.gemspec
79
+ homepage: https://github.com/jarias/tikas
80
+ licenses: []
81
+ post_install_message:
82
+ rdoc_options: []
83
+ require_paths:
84
+ - lib
85
+ required_ruby_version: !ruby/object:Gem::Requirement
86
+ none: false
87
+ requirements:
88
+ - - ! '>='
89
+ - !ruby/object:Gem::Version
90
+ version: '0'
91
+ required_rubygems_version: !ruby/object:Gem::Requirement
92
+ none: false
93
+ requirements:
94
+ - - ! '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ requirements: []
98
+ rubyforge_project: tikas
99
+ rubygems_version: 1.8.10
100
+ signing_key:
101
+ specification_version: 3
102
+ summary: Sinatra based service around Apache Tika (http://tika.apache.org/)
103
+ test_files:
104
+ - test/test_files/sample.pdf
105
+ - test/test_helper.rb
106
+ - test/tikas_test.rb