rika 1.4.0-java → 1.5.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Rakefile +1 -1
- data/lib/rika.rb +7 -7
- data/lib/rika/version.rb +1 -1
- data/pom.xml +3 -3
- data/rika.gemspec +2 -2
- data/spec/rika_spec.rb +11 -11
- data/target/dependency/{fontbox-1.8.1.jar → fontbox-1.8.4.jar} +0 -0
- data/target/dependency/{jempbox-1.8.1.jar → jempbox-1.8.4.jar} +0 -0
- data/target/dependency/jhighlight-1.0.jar +0 -0
- data/target/dependency/{pdfbox-1.8.1.jar → pdfbox-1.8.4.jar} +0 -0
- data/target/dependency/{poi-3.9.jar → poi-3.10-beta2.jar} +0 -0
- data/target/dependency/poi-ooxml-3.10-beta2.jar +0 -0
- data/target/dependency/{poi-ooxml-schemas-3.9.jar → poi-ooxml-schemas-3.10-beta2.jar} +0 -0
- data/target/dependency/{poi-scratchpad-3.9.jar → poi-scratchpad-3.10-beta2.jar} +0 -0
- data/target/dependency/{tika-core-1.4.jar → tika-core-1.5.jar} +0 -0
- data/target/dependency/{tika-parsers-1.4.jar → tika-parsers-1.5.jar} +0 -0
- metadata +19 -24
- data/target/dependency/poi-ooxml-3.9.jar +0 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 03d0a3d8e955115312ddf54329e6a613bf0d58e8
|
4
|
+
data.tar.gz: 6de10264cd6d8791b353f1e2a574f1eaaf982dd9
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 56ef6e0f28e6642dd624407335add98dededce244121996428008590bb1120908c4cb4b8b6964458bd032d2c521e985de524f578446c3c0c7102ad8a6b73d9a6
|
7
|
+
data.tar.gz: 67221ab8318af05b90f33cbb7520cc48fa7d474bd5c7170c76f54b268c4ab941b309bbd7c7ac704edd2aeb888634fcf5adf97a1a4d1ea1760d3fda270f6aa300
|
data/Rakefile
CHANGED
data/lib/rika.rb
CHANGED
@@ -5,13 +5,13 @@ raise "You need to run JRuby to use Rika" unless RUBY_PLATFORM =~ /java/
|
|
5
5
|
require "rika/version"
|
6
6
|
require 'uri'
|
7
7
|
require 'net/http'
|
8
|
-
require 'java'
|
8
|
+
require 'java'
|
9
9
|
|
10
10
|
Dir[File.join(File.dirname(__FILE__), "../target/dependency/*.jar")].each do |jar|
|
11
11
|
require jar
|
12
12
|
end
|
13
13
|
|
14
|
-
# Heavily based on the Apache Tika API: http://tika.apache.org/1.
|
14
|
+
# Heavily based on the Apache Tika API: http://tika.apache.org/1.5/api/org/apache/tika/Tika.html
|
15
15
|
module Rika
|
16
16
|
import org.apache.tika.metadata.Metadata
|
17
17
|
import org.apache.tika.Tika
|
@@ -36,7 +36,7 @@ module Rika
|
|
36
36
|
end
|
37
37
|
|
38
38
|
class Parser
|
39
|
-
|
39
|
+
|
40
40
|
def initialize(file_location, max_content_length = -1, detector = DefaultDetector.new)
|
41
41
|
@uri = file_location
|
42
42
|
@tika = Tika.new(detector)
|
@@ -48,14 +48,14 @@ module Rika
|
|
48
48
|
|
49
49
|
def content
|
50
50
|
self.parse
|
51
|
-
@content
|
51
|
+
@content
|
52
52
|
end
|
53
53
|
|
54
54
|
def metadata
|
55
55
|
unless @metadata_ruby
|
56
56
|
self.parse
|
57
57
|
@metadata_ruby = {}
|
58
|
-
|
58
|
+
|
59
59
|
@metadata_java.names.each do |name|
|
60
60
|
@metadata_ruby[name] = @metadata_java.get(name)
|
61
61
|
end
|
@@ -85,7 +85,7 @@ module Rika
|
|
85
85
|
|
86
86
|
def language
|
87
87
|
@lang ||= LanguageIdentifier.new(content)
|
88
|
-
|
88
|
+
|
89
89
|
@lang.language
|
90
90
|
end
|
91
91
|
|
@@ -96,7 +96,7 @@ module Rika
|
|
96
96
|
end
|
97
97
|
|
98
98
|
protected
|
99
|
-
|
99
|
+
|
100
100
|
def parse
|
101
101
|
@content ||= @tika.parse_to_string(input_stream, @metadata_java).to_s.strip
|
102
102
|
end
|
data/lib/rika/version.rb
CHANGED
data/pom.xml
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
<modelVersion>4.0.0</modelVersion>
|
4
4
|
|
5
5
|
<name>Rika</name>
|
6
|
-
|
6
|
+
|
7
7
|
<groupId>org.rika</groupId>
|
8
8
|
<artifactId>Rika</artifactId>
|
9
9
|
<version>1.0-SNAPSHOT</version>
|
@@ -13,8 +13,8 @@
|
|
13
13
|
<dependency>
|
14
14
|
<groupId>org.apache.tika</groupId>
|
15
15
|
<artifactId>tika-parsers</artifactId>
|
16
|
-
<version>1.
|
16
|
+
<version>1.5</version>
|
17
17
|
<scope>test</scope>
|
18
18
|
</dependency>
|
19
19
|
</dependencies>
|
20
|
-
</project>
|
20
|
+
</project>
|
data/rika.gemspec
CHANGED
@@ -15,7 +15,7 @@ Gem::Specification.new do |gem|
|
|
15
15
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
16
16
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
17
17
|
gem.require_paths = ["lib"]
|
18
|
-
gem.add_development_dependency "rspec", "2.
|
19
|
-
gem.add_development_dependency "rake", "10.
|
18
|
+
gem.add_development_dependency "rspec", "2.14.1"
|
19
|
+
gem.add_development_dependency "rake", "10.3.1"
|
20
20
|
gem.platform = "java"
|
21
21
|
end
|
data/spec/rika_spec.rb
CHANGED
@@ -4,8 +4,8 @@ require 'spec_helper'
|
|
4
4
|
require 'webrick'
|
5
5
|
|
6
6
|
include WEBrick
|
7
|
-
|
8
|
-
describe Rika::Parser do
|
7
|
+
|
8
|
+
describe Rika::Parser do
|
9
9
|
before(:all) do
|
10
10
|
@txt_parser = Rika::Parser.new(file_path("text_file.txt"))
|
11
11
|
@docx_parser = Rika::Parser.new(file_path("document.docx"))
|
@@ -13,13 +13,13 @@ describe Rika::Parser do
|
|
13
13
|
@pdf_parser = Rika::Parser.new(file_path("document.pdf"))
|
14
14
|
@image_parser = Rika::Parser.new(file_path("image.jpg"))
|
15
15
|
@unknown_parser = Rika::Parser.new(file_path("unknown.bin"))
|
16
|
-
@dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
|
17
|
-
port =
|
16
|
+
@dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
|
17
|
+
port = 50515
|
18
18
|
@url = "http://#{Socket.gethostname}:#{port}"
|
19
19
|
@quote = "First they ignore you, then they ridicule you, then they fight you, then you win."
|
20
20
|
@t1 = Thread.new do
|
21
|
-
@server = HTTPServer.new(:Port => port, :DocumentRoot => @dir,
|
22
|
-
:AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
|
21
|
+
@server = HTTPServer.new(:Port => port, :DocumentRoot => @dir,
|
22
|
+
:AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
|
23
23
|
@server.start
|
24
24
|
end
|
25
25
|
@sample_pdf_filespec = file_path("document.pdf")
|
@@ -55,7 +55,7 @@ describe Rika::Parser do
|
|
55
55
|
@docx_parser.content.should == @quote
|
56
56
|
end
|
57
57
|
|
58
|
-
it "should return the content in a pdf file" do
|
58
|
+
it "should return the content in a pdf file" do
|
59
59
|
@pdf_parser.content.should == @quote
|
60
60
|
end
|
61
61
|
|
@@ -70,7 +70,7 @@ describe Rika::Parser do
|
|
70
70
|
|
71
71
|
it "should only return max content length for file over http" do
|
72
72
|
parser = Rika::Parser.new(@url + "/document.pdf", 6)
|
73
|
-
parser.content.should == "First"
|
73
|
+
parser.content.should == "First"
|
74
74
|
end
|
75
75
|
|
76
76
|
it "should be possible to read files over 100k by default" do
|
@@ -88,8 +88,8 @@ describe Rika::Parser do
|
|
88
88
|
end
|
89
89
|
end
|
90
90
|
|
91
|
-
# We just test a few of the metadata fields for some common file formats
|
92
|
-
# to make sure the integration with Apache Tika works. Apache Tika already
|
91
|
+
# We just test a few of the metadata fields for some common file formats
|
92
|
+
# to make sure the integration with Apache Tika works. Apache Tika already
|
93
93
|
# have tests for all file formats it supports so we won't retest that
|
94
94
|
describe '#metadata' do
|
95
95
|
it "should return nil if metadata field does not exists" do
|
@@ -164,7 +164,7 @@ describe Rika::Parser do
|
|
164
164
|
|
165
165
|
describe '#language' do
|
166
166
|
it "should return the language of the content" do
|
167
|
-
|
167
|
+
|
168
168
|
["en", "de", "fr", "ru", "es"].each do |lang|
|
169
169
|
txt = Rika::Parser.new(file_path("#{lang}.txt"))
|
170
170
|
txt.language.should == lang
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
metadata
CHANGED
@@ -1,15 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
5
|
-
prerelease:
|
4
|
+
version: 1.5.0
|
6
5
|
platform: java
|
7
6
|
authors:
|
8
7
|
- Richard Nyström
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2014-04-23 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: rspec
|
@@ -17,14 +16,12 @@ dependencies:
|
|
17
16
|
requirements:
|
18
17
|
- - '='
|
19
18
|
- !ruby/object:Gem::Version
|
20
|
-
version: 2.
|
21
|
-
none: false
|
19
|
+
version: 2.14.1
|
22
20
|
requirement: !ruby/object:Gem::Requirement
|
23
21
|
requirements:
|
24
22
|
- - '='
|
25
23
|
- !ruby/object:Gem::Version
|
26
|
-
version: 2.
|
27
|
-
none: false
|
24
|
+
version: 2.14.1
|
28
25
|
prerelease: false
|
29
26
|
type: :development
|
30
27
|
- !ruby/object:Gem::Dependency
|
@@ -33,14 +30,12 @@ dependencies:
|
|
33
30
|
requirements:
|
34
31
|
- - '='
|
35
32
|
- !ruby/object:Gem::Version
|
36
|
-
version: 10.
|
37
|
-
none: false
|
33
|
+
version: 10.3.1
|
38
34
|
requirement: !ruby/object:Gem::Requirement
|
39
35
|
requirements:
|
40
36
|
- - '='
|
41
37
|
- !ruby/object:Gem::Version
|
42
|
-
version: 10.
|
43
|
-
none: false
|
38
|
+
version: 10.3.1
|
44
39
|
prerelease: false
|
45
40
|
type: :development
|
46
41
|
description: ' A JRuby wrapper for Apache Tika to extract text and metadata from various
|
@@ -89,24 +84,25 @@ files:
|
|
89
84
|
- target/dependency/commons-compress-1.5.jar
|
90
85
|
- target/dependency/commons-logging-1.1.1.jar
|
91
86
|
- target/dependency/dom4j-1.6.1.jar
|
92
|
-
- target/dependency/fontbox-1.8.
|
87
|
+
- target/dependency/fontbox-1.8.4.jar
|
93
88
|
- target/dependency/geronimo-stax-api_1.0_spec-1.0.1.jar
|
94
89
|
- target/dependency/isoparser-1.0-RC-1.jar
|
95
90
|
- target/dependency/jdom-1.0.jar
|
96
|
-
- target/dependency/jempbox-1.8.
|
91
|
+
- target/dependency/jempbox-1.8.4.jar
|
92
|
+
- target/dependency/jhighlight-1.0.jar
|
97
93
|
- target/dependency/juniversalchardet-1.0.3.jar
|
98
94
|
- target/dependency/metadata-extractor-2.6.2.jar
|
99
95
|
- target/dependency/netcdf-4.2-min.jar
|
100
|
-
- target/dependency/pdfbox-1.8.
|
101
|
-
- target/dependency/poi-3.
|
102
|
-
- target/dependency/poi-ooxml-3.
|
103
|
-
- target/dependency/poi-ooxml-schemas-3.
|
104
|
-
- target/dependency/poi-scratchpad-3.
|
96
|
+
- target/dependency/pdfbox-1.8.4.jar
|
97
|
+
- target/dependency/poi-3.10-beta2.jar
|
98
|
+
- target/dependency/poi-ooxml-3.10-beta2.jar
|
99
|
+
- target/dependency/poi-ooxml-schemas-3.10-beta2.jar
|
100
|
+
- target/dependency/poi-scratchpad-3.10-beta2.jar
|
105
101
|
- target/dependency/rome-0.9.jar
|
106
102
|
- target/dependency/slf4j-api-1.5.6.jar
|
107
103
|
- target/dependency/tagsoup-1.2.1.jar
|
108
|
-
- target/dependency/tika-core-1.
|
109
|
-
- target/dependency/tika-parsers-1.
|
104
|
+
- target/dependency/tika-core-1.5.jar
|
105
|
+
- target/dependency/tika-parsers-1.5.jar
|
110
106
|
- target/dependency/vorbis-java-core-0.1-tests.jar
|
111
107
|
- target/dependency/vorbis-java-core-0.1.jar
|
112
108
|
- target/dependency/vorbis-java-tika-0.1.jar
|
@@ -117,6 +113,7 @@ files:
|
|
117
113
|
- target/dependency/xz-1.2.jar
|
118
114
|
homepage: https://github.com/ricn/rika
|
119
115
|
licenses: []
|
116
|
+
metadata: {}
|
120
117
|
post_install_message:
|
121
118
|
rdoc_options: []
|
122
119
|
require_paths:
|
@@ -126,18 +123,16 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
126
123
|
- - '>='
|
127
124
|
- !ruby/object:Gem::Version
|
128
125
|
version: '0'
|
129
|
-
none: false
|
130
126
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
131
127
|
requirements:
|
132
128
|
- - '>='
|
133
129
|
- !ruby/object:Gem::Version
|
134
130
|
version: '0'
|
135
|
-
none: false
|
136
131
|
requirements: []
|
137
132
|
rubyforge_project:
|
138
|
-
rubygems_version: 1.
|
133
|
+
rubygems_version: 2.1.9
|
139
134
|
signing_key:
|
140
|
-
specification_version:
|
135
|
+
specification_version: 4
|
141
136
|
summary: A JRuby wrapper for Apache Tika to extract text and metadata from various file formats.
|
142
137
|
test_files:
|
143
138
|
- spec/fixtures/de.txt
|
Binary file
|