rika 1.4.0-java → 1.5.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Rakefile +1 -1
- data/lib/rika.rb +7 -7
- data/lib/rika/version.rb +1 -1
- data/pom.xml +3 -3
- data/rika.gemspec +2 -2
- data/spec/rika_spec.rb +11 -11
- data/target/dependency/{fontbox-1.8.1.jar → fontbox-1.8.4.jar} +0 -0
- data/target/dependency/{jempbox-1.8.1.jar → jempbox-1.8.4.jar} +0 -0
- data/target/dependency/jhighlight-1.0.jar +0 -0
- data/target/dependency/{pdfbox-1.8.1.jar → pdfbox-1.8.4.jar} +0 -0
- data/target/dependency/{poi-3.9.jar → poi-3.10-beta2.jar} +0 -0
- data/target/dependency/poi-ooxml-3.10-beta2.jar +0 -0
- data/target/dependency/{poi-ooxml-schemas-3.9.jar → poi-ooxml-schemas-3.10-beta2.jar} +0 -0
- data/target/dependency/{poi-scratchpad-3.9.jar → poi-scratchpad-3.10-beta2.jar} +0 -0
- data/target/dependency/{tika-core-1.4.jar → tika-core-1.5.jar} +0 -0
- data/target/dependency/{tika-parsers-1.4.jar → tika-parsers-1.5.jar} +0 -0
- metadata +19 -24
- data/target/dependency/poi-ooxml-3.9.jar +0 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 03d0a3d8e955115312ddf54329e6a613bf0d58e8
|
4
|
+
data.tar.gz: 6de10264cd6d8791b353f1e2a574f1eaaf982dd9
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 56ef6e0f28e6642dd624407335add98dededce244121996428008590bb1120908c4cb4b8b6964458bd032d2c521e985de524f578446c3c0c7102ad8a6b73d9a6
|
7
|
+
data.tar.gz: 67221ab8318af05b90f33cbb7520cc48fa7d474bd5c7170c76f54b268c4ab941b309bbd7c7ac704edd2aeb888634fcf5adf97a1a4d1ea1760d3fda270f6aa300
|
data/Rakefile
CHANGED
data/lib/rika.rb
CHANGED
@@ -5,13 +5,13 @@ raise "You need to run JRuby to use Rika" unless RUBY_PLATFORM =~ /java/
|
|
5
5
|
require "rika/version"
|
6
6
|
require 'uri'
|
7
7
|
require 'net/http'
|
8
|
-
require 'java'
|
8
|
+
require 'java'
|
9
9
|
|
10
10
|
Dir[File.join(File.dirname(__FILE__), "../target/dependency/*.jar")].each do |jar|
|
11
11
|
require jar
|
12
12
|
end
|
13
13
|
|
14
|
-
# Heavily based on the Apache Tika API: http://tika.apache.org/1.
|
14
|
+
# Heavily based on the Apache Tika API: http://tika.apache.org/1.5/api/org/apache/tika/Tika.html
|
15
15
|
module Rika
|
16
16
|
import org.apache.tika.metadata.Metadata
|
17
17
|
import org.apache.tika.Tika
|
@@ -36,7 +36,7 @@ module Rika
|
|
36
36
|
end
|
37
37
|
|
38
38
|
class Parser
|
39
|
-
|
39
|
+
|
40
40
|
def initialize(file_location, max_content_length = -1, detector = DefaultDetector.new)
|
41
41
|
@uri = file_location
|
42
42
|
@tika = Tika.new(detector)
|
@@ -48,14 +48,14 @@ module Rika
|
|
48
48
|
|
49
49
|
def content
|
50
50
|
self.parse
|
51
|
-
@content
|
51
|
+
@content
|
52
52
|
end
|
53
53
|
|
54
54
|
def metadata
|
55
55
|
unless @metadata_ruby
|
56
56
|
self.parse
|
57
57
|
@metadata_ruby = {}
|
58
|
-
|
58
|
+
|
59
59
|
@metadata_java.names.each do |name|
|
60
60
|
@metadata_ruby[name] = @metadata_java.get(name)
|
61
61
|
end
|
@@ -85,7 +85,7 @@ module Rika
|
|
85
85
|
|
86
86
|
def language
|
87
87
|
@lang ||= LanguageIdentifier.new(content)
|
88
|
-
|
88
|
+
|
89
89
|
@lang.language
|
90
90
|
end
|
91
91
|
|
@@ -96,7 +96,7 @@ module Rika
|
|
96
96
|
end
|
97
97
|
|
98
98
|
protected
|
99
|
-
|
99
|
+
|
100
100
|
def parse
|
101
101
|
@content ||= @tika.parse_to_string(input_stream, @metadata_java).to_s.strip
|
102
102
|
end
|
data/lib/rika/version.rb
CHANGED
data/pom.xml
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
<modelVersion>4.0.0</modelVersion>
|
4
4
|
|
5
5
|
<name>Rika</name>
|
6
|
-
|
6
|
+
|
7
7
|
<groupId>org.rika</groupId>
|
8
8
|
<artifactId>Rika</artifactId>
|
9
9
|
<version>1.0-SNAPSHOT</version>
|
@@ -13,8 +13,8 @@
|
|
13
13
|
<dependency>
|
14
14
|
<groupId>org.apache.tika</groupId>
|
15
15
|
<artifactId>tika-parsers</artifactId>
|
16
|
-
<version>1.
|
16
|
+
<version>1.5</version>
|
17
17
|
<scope>test</scope>
|
18
18
|
</dependency>
|
19
19
|
</dependencies>
|
20
|
-
</project>
|
20
|
+
</project>
|
data/rika.gemspec
CHANGED
@@ -15,7 +15,7 @@ Gem::Specification.new do |gem|
|
|
15
15
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
16
16
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
17
17
|
gem.require_paths = ["lib"]
|
18
|
-
gem.add_development_dependency "rspec", "2.
|
19
|
-
gem.add_development_dependency "rake", "10.
|
18
|
+
gem.add_development_dependency "rspec", "2.14.1"
|
19
|
+
gem.add_development_dependency "rake", "10.3.1"
|
20
20
|
gem.platform = "java"
|
21
21
|
end
|
data/spec/rika_spec.rb
CHANGED
@@ -4,8 +4,8 @@ require 'spec_helper'
|
|
4
4
|
require 'webrick'
|
5
5
|
|
6
6
|
include WEBrick
|
7
|
-
|
8
|
-
describe Rika::Parser do
|
7
|
+
|
8
|
+
describe Rika::Parser do
|
9
9
|
before(:all) do
|
10
10
|
@txt_parser = Rika::Parser.new(file_path("text_file.txt"))
|
11
11
|
@docx_parser = Rika::Parser.new(file_path("document.docx"))
|
@@ -13,13 +13,13 @@ describe Rika::Parser do
|
|
13
13
|
@pdf_parser = Rika::Parser.new(file_path("document.pdf"))
|
14
14
|
@image_parser = Rika::Parser.new(file_path("image.jpg"))
|
15
15
|
@unknown_parser = Rika::Parser.new(file_path("unknown.bin"))
|
16
|
-
@dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
|
17
|
-
port =
|
16
|
+
@dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
|
17
|
+
port = 50515
|
18
18
|
@url = "http://#{Socket.gethostname}:#{port}"
|
19
19
|
@quote = "First they ignore you, then they ridicule you, then they fight you, then you win."
|
20
20
|
@t1 = Thread.new do
|
21
|
-
@server = HTTPServer.new(:Port => port, :DocumentRoot => @dir,
|
22
|
-
:AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
|
21
|
+
@server = HTTPServer.new(:Port => port, :DocumentRoot => @dir,
|
22
|
+
:AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
|
23
23
|
@server.start
|
24
24
|
end
|
25
25
|
@sample_pdf_filespec = file_path("document.pdf")
|
@@ -55,7 +55,7 @@ describe Rika::Parser do
|
|
55
55
|
@docx_parser.content.should == @quote
|
56
56
|
end
|
57
57
|
|
58
|
-
it "should return the content in a pdf file" do
|
58
|
+
it "should return the content in a pdf file" do
|
59
59
|
@pdf_parser.content.should == @quote
|
60
60
|
end
|
61
61
|
|
@@ -70,7 +70,7 @@ describe Rika::Parser do
|
|
70
70
|
|
71
71
|
it "should only return max content length for file over http" do
|
72
72
|
parser = Rika::Parser.new(@url + "/document.pdf", 6)
|
73
|
-
parser.content.should == "First"
|
73
|
+
parser.content.should == "First"
|
74
74
|
end
|
75
75
|
|
76
76
|
it "should be possible to read files over 100k by default" do
|
@@ -88,8 +88,8 @@ describe Rika::Parser do
|
|
88
88
|
end
|
89
89
|
end
|
90
90
|
|
91
|
-
# We just test a few of the metadata fields for some common file formats
|
92
|
-
# to make sure the integration with Apache Tika works. Apache Tika already
|
91
|
+
# We just test a few of the metadata fields for some common file formats
|
92
|
+
# to make sure the integration with Apache Tika works. Apache Tika already
|
93
93
|
# have tests for all file formats it supports so we won't retest that
|
94
94
|
describe '#metadata' do
|
95
95
|
it "should return nil if metadata field does not exists" do
|
@@ -164,7 +164,7 @@ describe Rika::Parser do
|
|
164
164
|
|
165
165
|
describe '#language' do
|
166
166
|
it "should return the language of the content" do
|
167
|
-
|
167
|
+
|
168
168
|
["en", "de", "fr", "ru", "es"].each do |lang|
|
169
169
|
txt = Rika::Parser.new(file_path("#{lang}.txt"))
|
170
170
|
txt.language.should == lang
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
metadata
CHANGED
@@ -1,15 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
5
|
-
prerelease:
|
4
|
+
version: 1.5.0
|
6
5
|
platform: java
|
7
6
|
authors:
|
8
7
|
- Richard Nyström
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2014-04-23 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: rspec
|
@@ -17,14 +16,12 @@ dependencies:
|
|
17
16
|
requirements:
|
18
17
|
- - '='
|
19
18
|
- !ruby/object:Gem::Version
|
20
|
-
version: 2.
|
21
|
-
none: false
|
19
|
+
version: 2.14.1
|
22
20
|
requirement: !ruby/object:Gem::Requirement
|
23
21
|
requirements:
|
24
22
|
- - '='
|
25
23
|
- !ruby/object:Gem::Version
|
26
|
-
version: 2.
|
27
|
-
none: false
|
24
|
+
version: 2.14.1
|
28
25
|
prerelease: false
|
29
26
|
type: :development
|
30
27
|
- !ruby/object:Gem::Dependency
|
@@ -33,14 +30,12 @@ dependencies:
|
|
33
30
|
requirements:
|
34
31
|
- - '='
|
35
32
|
- !ruby/object:Gem::Version
|
36
|
-
version: 10.
|
37
|
-
none: false
|
33
|
+
version: 10.3.1
|
38
34
|
requirement: !ruby/object:Gem::Requirement
|
39
35
|
requirements:
|
40
36
|
- - '='
|
41
37
|
- !ruby/object:Gem::Version
|
42
|
-
version: 10.
|
43
|
-
none: false
|
38
|
+
version: 10.3.1
|
44
39
|
prerelease: false
|
45
40
|
type: :development
|
46
41
|
description: ' A JRuby wrapper for Apache Tika to extract text and metadata from various
|
@@ -89,24 +84,25 @@ files:
|
|
89
84
|
- target/dependency/commons-compress-1.5.jar
|
90
85
|
- target/dependency/commons-logging-1.1.1.jar
|
91
86
|
- target/dependency/dom4j-1.6.1.jar
|
92
|
-
- target/dependency/fontbox-1.8.
|
87
|
+
- target/dependency/fontbox-1.8.4.jar
|
93
88
|
- target/dependency/geronimo-stax-api_1.0_spec-1.0.1.jar
|
94
89
|
- target/dependency/isoparser-1.0-RC-1.jar
|
95
90
|
- target/dependency/jdom-1.0.jar
|
96
|
-
- target/dependency/jempbox-1.8.
|
91
|
+
- target/dependency/jempbox-1.8.4.jar
|
92
|
+
- target/dependency/jhighlight-1.0.jar
|
97
93
|
- target/dependency/juniversalchardet-1.0.3.jar
|
98
94
|
- target/dependency/metadata-extractor-2.6.2.jar
|
99
95
|
- target/dependency/netcdf-4.2-min.jar
|
100
|
-
- target/dependency/pdfbox-1.8.
|
101
|
-
- target/dependency/poi-3.
|
102
|
-
- target/dependency/poi-ooxml-3.
|
103
|
-
- target/dependency/poi-ooxml-schemas-3.
|
104
|
-
- target/dependency/poi-scratchpad-3.
|
96
|
+
- target/dependency/pdfbox-1.8.4.jar
|
97
|
+
- target/dependency/poi-3.10-beta2.jar
|
98
|
+
- target/dependency/poi-ooxml-3.10-beta2.jar
|
99
|
+
- target/dependency/poi-ooxml-schemas-3.10-beta2.jar
|
100
|
+
- target/dependency/poi-scratchpad-3.10-beta2.jar
|
105
101
|
- target/dependency/rome-0.9.jar
|
106
102
|
- target/dependency/slf4j-api-1.5.6.jar
|
107
103
|
- target/dependency/tagsoup-1.2.1.jar
|
108
|
-
- target/dependency/tika-core-1.
|
109
|
-
- target/dependency/tika-parsers-1.
|
104
|
+
- target/dependency/tika-core-1.5.jar
|
105
|
+
- target/dependency/tika-parsers-1.5.jar
|
110
106
|
- target/dependency/vorbis-java-core-0.1-tests.jar
|
111
107
|
- target/dependency/vorbis-java-core-0.1.jar
|
112
108
|
- target/dependency/vorbis-java-tika-0.1.jar
|
@@ -117,6 +113,7 @@ files:
|
|
117
113
|
- target/dependency/xz-1.2.jar
|
118
114
|
homepage: https://github.com/ricn/rika
|
119
115
|
licenses: []
|
116
|
+
metadata: {}
|
120
117
|
post_install_message:
|
121
118
|
rdoc_options: []
|
122
119
|
require_paths:
|
@@ -126,18 +123,16 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
126
123
|
- - '>='
|
127
124
|
- !ruby/object:Gem::Version
|
128
125
|
version: '0'
|
129
|
-
none: false
|
130
126
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
131
127
|
requirements:
|
132
128
|
- - '>='
|
133
129
|
- !ruby/object:Gem::Version
|
134
130
|
version: '0'
|
135
|
-
none: false
|
136
131
|
requirements: []
|
137
132
|
rubyforge_project:
|
138
|
-
rubygems_version: 1.
|
133
|
+
rubygems_version: 2.1.9
|
139
134
|
signing_key:
|
140
|
-
specification_version:
|
135
|
+
specification_version: 4
|
141
136
|
summary: A JRuby wrapper for Apache Tika to extract text and metadata from various file formats.
|
142
137
|
test_files:
|
143
138
|
- spec/fixtures/de.txt
|
Binary file
|