rika 1.1.1-java → 1.11.1-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/.travis.yml +3 -3
- data/README.md +82 -40
- data/RELEASE_NOTES.md +17 -0
- data/Rakefile +1 -1
- data/java-lib/tika-app-1.24.1.jar +0 -0
- data/lib/rika.rb +18 -93
- data/lib/rika/parser.rb +90 -0
- data/lib/rika/version.rb +1 -1
- data/pom.xml +4 -4
- data/rika.gemspec +9 -7
- data/rika_helper.rb +38 -0
- data/spec/fixtures/de.txt +21 -1
- data/spec/fixtures/document.doc +0 -0
- data/spec/fixtures/document.docx +0 -0
- data/spec/fixtures/document.pdf +0 -0
- data/spec/fixtures/en.txt +23 -1
- data/spec/fixtures/es.txt +21 -1
- data/spec/fixtures/fr.txt +23 -1
- data/spec/fixtures/ru.txt +21 -1
- data/spec/fixtures/text_file.txt +23 -1
- data/spec/fixtures/text_file_without_extension +23 -1
- data/spec/rika_spec.rb +153 -101
- data/spec/spec_helper.rb +4 -3
- metadata +36 -76
- data/spec/fixtures/over_100k_file.txt +0 -1241
- data/target/dependency/apache-mime4j-core-0.7.2.jar +0 -0
- data/target/dependency/apache-mime4j-dom-0.7.2.jar +0 -0
- data/target/dependency/asm-3.1.jar +0 -0
- data/target/dependency/aspectjrt-1.6.11.jar +0 -0
- data/target/dependency/bcmail-jdk15-1.45.jar +0 -0
- data/target/dependency/bcprov-jdk15-1.45.jar +0 -0
- data/target/dependency/boilerpipe-1.1.0.jar +0 -0
- data/target/dependency/commons-codec-1.5.jar +0 -0
- data/target/dependency/commons-compress-1.4.1.jar +0 -0
- data/target/dependency/commons-logging-1.1.1.jar +0 -0
- data/target/dependency/dom4j-1.6.1.jar +0 -0
- data/target/dependency/fontbox-1.7.1.jar +0 -0
- data/target/dependency/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
- data/target/dependency/isoparser-1.0-RC-1.jar +0 -0
- data/target/dependency/jdom-1.0.jar +0 -0
- data/target/dependency/jempbox-1.7.1.jar +0 -0
- data/target/dependency/juniversalchardet-1.0.3.jar +0 -0
- data/target/dependency/metadata-extractor-2.6.2.jar +0 -0
- data/target/dependency/netcdf-4.2-min.jar +0 -0
- data/target/dependency/pdfbox-1.7.1.jar +0 -0
- data/target/dependency/poi-3.8.jar +0 -0
- data/target/dependency/poi-ooxml-3.8.jar +0 -0
- data/target/dependency/poi-ooxml-schemas-3.8.jar +0 -0
- data/target/dependency/poi-scratchpad-3.8.jar +0 -0
- data/target/dependency/rome-0.9.jar +0 -0
- data/target/dependency/slf4j-api-1.5.6.jar +0 -0
- data/target/dependency/tagsoup-1.2.1.jar +0 -0
- data/target/dependency/tika-core-1.3.jar +0 -0
- data/target/dependency/tika-parsers-1.3.jar +0 -0
- data/target/dependency/vorbis-java-core-0.1-tests.jar +0 -0
- data/target/dependency/vorbis-java-core-0.1.jar +0 -0
- data/target/dependency/vorbis-java-tika-0.1.jar +0 -0
- data/target/dependency/xercesImpl-2.8.1.jar +0 -0
- data/target/dependency/xml-apis-1.3.03.jar +0 -0
- data/target/dependency/xmlbeans-2.3.0.jar +0 -0
- data/target/dependency/xmpcore-5.1.2.jar +0 -0
- data/target/dependency/xz-1.0.jar +0 -0
data/pom.xml
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
<modelVersion>4.0.0</modelVersion>
|
4
4
|
|
5
5
|
<name>Rika</name>
|
6
|
-
|
6
|
+
|
7
7
|
<groupId>org.rika</groupId>
|
8
8
|
<artifactId>Rika</artifactId>
|
9
9
|
<version>1.0-SNAPSHOT</version>
|
@@ -12,9 +12,9 @@
|
|
12
12
|
<dependencies>
|
13
13
|
<dependency>
|
14
14
|
<groupId>org.apache.tika</groupId>
|
15
|
-
<artifactId>tika-
|
16
|
-
<version>1.
|
15
|
+
<artifactId>tika-app</artifactId>
|
16
|
+
<version>1.24</version>
|
17
17
|
<scope>test</scope>
|
18
18
|
</dependency>
|
19
19
|
</dependencies>
|
20
|
-
</project>
|
20
|
+
</project>
|
data/rika.gemspec
CHANGED
@@ -6,16 +6,18 @@ require 'rika/version'
|
|
6
6
|
Gem::Specification.new do |gem|
|
7
7
|
gem.name = "rika"
|
8
8
|
gem.version = Rika::VERSION
|
9
|
-
gem.authors = ["Richard Nyström"]
|
10
|
-
gem.email = ["ricny046@gmail.com"]
|
11
|
-
gem.description = %q{ A JRuby wrapper for Apache Tika to extract text and metadata from various
|
12
|
-
gem.summary = %q{ A JRuby wrapper for Apache Tika to extract text and metadata from various
|
13
|
-
gem.homepage = "https://github.com/
|
9
|
+
gem.authors = ["Richard Nyström", "Keith Bennett"]
|
10
|
+
gem.email = ["ricny046@gmail.com", "keithrbennett@gmail.com"]
|
11
|
+
gem.description = %q{ A JRuby wrapper for Apache Tika to extract text and metadata from files of various formats. }
|
12
|
+
gem.summary = %q{ A JRuby wrapper for Apache Tika to extract text and metadata from files of various formats. }
|
13
|
+
gem.homepage = "https://github.com/keithrbennett/rika"
|
14
14
|
gem.files = `git ls-files`.split($/)
|
15
15
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
16
16
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
17
17
|
gem.require_paths = ["lib"]
|
18
|
-
gem.add_development_dependency "rspec", "
|
19
|
-
gem.add_development_dependency "rake", "
|
18
|
+
gem.add_development_dependency "rspec", "~> 3.9"
|
19
|
+
gem.add_development_dependency "rake", "~> 13.0"
|
20
20
|
gem.platform = "java"
|
21
|
+
gem.license = "Apache-2.0"
|
21
22
|
end
|
23
|
+
|
data/rika_helper.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# Defines some shortcuts for ad-hoc work with Rika.
|
2
|
+
#
|
3
|
+
# Can be used with the `irb`/`jirb` or `pry` (https://github.com/pry/pry) interactive shells:
|
4
|
+
# `pry -r rika_helper.rb`
|
5
|
+
#
|
6
|
+
# Can be used with the `rexe` command line executor (https://github.com/keithrbennett/rexe):
|
7
|
+
# rexe -r ./rika_helper.rb # e.g., add: `-oa 'm "x.pdf"'` to output metadata w/AwesomePrint
|
8
|
+
#
|
9
|
+
# or plain Ruby:
|
10
|
+
# ruby -r ./rika_helper -r awesome_print -e 'ap m("x.pdf")'
|
11
|
+
|
12
|
+
require 'rika'
|
13
|
+
|
14
|
+
def c(resource)
|
15
|
+
Rika.parse_content(resource)
|
16
|
+
end
|
17
|
+
|
18
|
+
def m(resource)
|
19
|
+
Rika.parse_metadata(resource)
|
20
|
+
end
|
21
|
+
|
22
|
+
def cm(resource)
|
23
|
+
Rika.parse_content_and_metadata(resource)
|
24
|
+
end
|
25
|
+
|
26
|
+
def cmh(resource)
|
27
|
+
Rika.parse_content_and_metadata_as_hash(resource)
|
28
|
+
end
|
29
|
+
|
30
|
+
# Returns the metadata of +resource+ (as produced by #m) as a compact JSON string.
# 'json' is required lazily so that #to_json is available even if nothing else loaded it.
def mj(resource); require 'json'; m(resource).to_json; end
|
31
|
+
# Returns the metadata of +resource+ (as produced by #m) as pretty-printed, multi-line JSON.
# 'json' is required lazily so that the JSON constant is defined even if nothing else loaded it.
def mJ(resource); require 'json'; JSON.pretty_generate(m(resource)); end
|
32
|
+
# Returns the metadata of +resource+ (as produced by #m) as a YAML document string.
# 'yaml' is required lazily so that #to_yaml is available even if nothing else loaded it.
def my(resource); require 'yaml'; m(resource).to_yaml; end
|
33
|
+
# Returns the metadata of +resource+ (as produced by #m) formatted by AwesomePrint's #ai.
# Named `ma` (metadata / awesome_print) so that it no longer redefines the YAML helper `my`
# declared on the preceding line; awesome_print is loaded only when this helper is used.
def ma(resource); require 'awesome_print'; m(resource).ai; end
|
34
|
+
|
35
|
+
# Returns both the content and the metadata of +resource+ as a compact JSON string.
# Serializes the hash returned by #cmh rather than the content alone (#c), so the
# result matches the "cm" (content + metadata) prefix of this helper's name.
def cmj(resource); require 'json'; cmh(resource).to_json; end
|
36
|
+
# Returns both the content and the metadata of +resource+ as pretty-printed JSON.
# Serializes the hash returned by #cmh rather than the content alone (#c), so the
# result matches the "cm" (content + metadata) prefix of this helper's name.
def cmJ(resource); require 'json'; JSON.pretty_generate(cmh(resource)); end
|
37
|
+
# Returns both the content and the metadata of +resource+ as a YAML document string.
# Serializes the hash returned by #cmh rather than the content alone (#c), so the
# result matches the "cm" (content + metadata) prefix of this helper's name.
# 'yaml' is required lazily so that #to_yaml is available even if nothing else loaded it.
def cmy(resource); require 'yaml'; cmh(resource).to_yaml; end
|
38
|
+
# Loads awesome_print, then returns a hash with the :content and :metadata of +resource+,
# taken from the two-element result of #cm.
# NOTE(review): awesome_print is required but #ai is never called here, so the return
# value is a plain Hash — confirm whether formatted output was intended.
def cma(resource)
  require 'awesome_print'
  content, metadata = cm(resource)
  { content: content, metadata: metadata }
end
|
data/spec/fixtures/de.txt
CHANGED
@@ -1 +1,21 @@
|
|
1
|
-
|
1
|
+
Eines verschneiten Abends im Wald verhaltend (German)
|
2
|
+
|
3
|
+
Wohl weiß ich, wer den Wald besitzt,
|
4
|
+
Sein Haus jedoch im Dorf dort ist;
|
5
|
+
Er merkt nicht, wie ich hier verharr'
|
6
|
+
Zu schaun, wie Wald den Schnee begrüßt.
|
7
|
+
|
8
|
+
Mein Pferdchen denkt, ich wär ein Narr,
|
9
|
+
Zu halten, wo nicht Haus noch Farm,
|
10
|
+
Im Wald, nah beim gefrornen Teich,
|
11
|
+
Am finstersten Abend im Jahr.
|
12
|
+
|
13
|
+
Des Zaumzeugs Glöckchen klingen leis:
|
14
|
+
Es fragt: was ist? geht's weiter gleich?
|
15
|
+
Ansonsten hört man nur ganz sacht
|
16
|
+
Den Schneefall und den Wind, der streicht.
|
17
|
+
|
18
|
+
Der Wald ist nett, tief wie die Nacht,
|
19
|
+
Doch halte ich, was ich versprach,
|
20
|
+
Und Meilen sind's noch bis zum Schlaf,
|
21
|
+
Und Meilen sind's noch bis zum Schlaf.
|
data/spec/fixtures/document.doc
CHANGED
Binary file
|
data/spec/fixtures/document.docx
CHANGED
Binary file
|
data/spec/fixtures/document.pdf
CHANGED
Binary file
|
data/spec/fixtures/en.txt
CHANGED
@@ -1 +1,23 @@
|
|
1
|
-
|
1
|
+
Stopping by Woods on a Snowy Evening
|
2
|
+
|
3
|
+
By Robert Frost
|
4
|
+
|
5
|
+
Whose woods these are I think I know.
|
6
|
+
His house is in the village though;
|
7
|
+
He will not see me stopping here
|
8
|
+
To watch his woods fill up with snow.
|
9
|
+
|
10
|
+
My little horse must think it queer
|
11
|
+
To stop without a farmhouse near
|
12
|
+
Between the woods and frozen lake
|
13
|
+
The darkest evening of the year.
|
14
|
+
|
15
|
+
He gives his harness bells a shake
|
16
|
+
To ask if there is some mistake.
|
17
|
+
The only other sound’s the sweep
|
18
|
+
Of easy wind and downy flake.
|
19
|
+
|
20
|
+
The woods are lovely, dark and deep,
|
21
|
+
But I have promises to keep,
|
22
|
+
And miles to go before I sleep,
|
23
|
+
And miles to go before I sleep.
|
data/spec/fixtures/es.txt
CHANGED
@@ -1 +1,21 @@
|
|
1
|
-
|
1
|
+
Un alto en el bosque mientras nieva
|
2
|
+
|
3
|
+
De quién es este bosque, saber creo
|
4
|
+
-en el poblado su morada veo-
|
5
|
+
no habrá de sorprenderme contemplando
|
6
|
+
cubrir su bosque el invernal blanqueo.
|
7
|
+
|
8
|
+
Mi caballito se dirá extrañado
|
9
|
+
que, sin granja cercana, hemos parado
|
10
|
+
de este año en la tarde más oscura,
|
11
|
+
entre el bosque y el lago congelado.
|
12
|
+
|
13
|
+
Sacudiéndose, agita su cencerro
|
14
|
+
preguntando quizá: -¿será algún yerro?
|
15
|
+
Sólo el cierzo y los copos rumorean
|
16
|
+
blandamente del bosque en el encierro.
|
17
|
+
|
18
|
+
Yo, el bosque hondo y fusco veo risueño...
|
19
|
+
Mas, en cumplir promesas tengo empeño,
|
20
|
+
y millas debo andar antes del sueño,
|
21
|
+
un largo andar para llegar al sueño.
|
data/spec/fixtures/fr.txt
CHANGED
@@ -1 +1,23 @@
|
|
1
|
-
En
|
1
|
+
En s'arrêtant par les bois un soir de neige
|
2
|
+
|
3
|
+
Auteur: Robert Frost
|
4
|
+
|
5
|
+
À qui sont ces bois, je crois que je sais.
|
6
|
+
Sa maison est au village pourtant ;
|
7
|
+
Il ne me verra pas m'arrêter ici
|
8
|
+
À regarder ses bois recouverts de neige.
|
9
|
+
|
10
|
+
Mon petit cheval doit trouver cela étrange
|
11
|
+
De s'arrêter loin de toute ferme aux alentours
|
12
|
+
Entre ces bois et ce lac gelé
|
13
|
+
Au soir le plus sombre de l'année.
|
14
|
+
|
15
|
+
Il fait tinter les clochettes de son harnais en tirant dessus
|
16
|
+
Comme pour demander s'il n'y a pas là quelque erreur.
|
17
|
+
Les seuls autres bruits le souffle
|
18
|
+
D'une brise légère et le son duveteux des flocons.
|
19
|
+
|
20
|
+
Les bois sont beaux, sombres et profonds,
|
21
|
+
Mais j'ai des promesses à tenir
|
22
|
+
Et un long chemin à parcourir avant de dormir,
|
23
|
+
Et un long chemin à parcourir avant de dormir.
|
data/spec/fixtures/ru.txt
CHANGED
@@ -1 +1,21 @@
|
|
1
|
-
|
1
|
+
Остановившись у леса снежным вечером
|
2
|
+
|
3
|
+
Чей лес, мне кажется, я знаю:
|
4
|
+
в селе живет его хозяин.
|
5
|
+
Он не увидит, как на снежный
|
6
|
+
я лес его стою взираю.
|
7
|
+
|
8
|
+
В недоуменье конь, конечно,
|
9
|
+
зачем в ночи за год темнейшей
|
10
|
+
мы стали там, где нет жилья,
|
11
|
+
у леса с озером замерзшим.
|
12
|
+
|
13
|
+
Он, бубенцом слегка звеня,
|
14
|
+
как будто бы корит меня,
|
15
|
+
да веет слабый ветерок,
|
16
|
+
пушистым снегом шелестя.
|
17
|
+
|
18
|
+
Лес сладок, темен и глубок,
|
19
|
+
но в путь пора мне — долг есть долг.
|
20
|
+
И ехать долго — сон далек,
|
21
|
+
и ехать долго — сон далек.
|
data/spec/fixtures/text_file.txt
CHANGED
@@ -1 +1,23 @@
|
|
1
|
-
|
1
|
+
Stopping by Woods on a Snowy Evening
|
2
|
+
|
3
|
+
By Robert Frost
|
4
|
+
|
5
|
+
Whose woods these are I think I know.
|
6
|
+
His house is in the village though;
|
7
|
+
He will not see me stopping here
|
8
|
+
To watch his woods fill up with snow.
|
9
|
+
|
10
|
+
My little horse must think it queer
|
11
|
+
To stop without a farmhouse near
|
12
|
+
Between the woods and frozen lake
|
13
|
+
The darkest evening of the year.
|
14
|
+
|
15
|
+
He gives his harness bells a shake
|
16
|
+
To ask if there is some mistake.
|
17
|
+
The only other sound’s the sweep
|
18
|
+
Of easy wind and downy flake.
|
19
|
+
|
20
|
+
The woods are lovely, dark and deep,
|
21
|
+
But I have promises to keep,
|
22
|
+
And miles to go before I sleep,
|
23
|
+
And miles to go before I sleep.
|
@@ -1 +1,23 @@
|
|
1
|
-
|
1
|
+
Stopping by Woods on a Snowy Evening
|
2
|
+
|
3
|
+
By Robert Frost
|
4
|
+
|
5
|
+
Whose woods these are I think I know.
|
6
|
+
His house is in the village though;
|
7
|
+
He will not see me stopping here
|
8
|
+
To watch his woods fill up with snow.
|
9
|
+
|
10
|
+
My little horse must think it queer
|
11
|
+
To stop without a farmhouse near
|
12
|
+
Between the woods and frozen lake
|
13
|
+
The darkest evening of the year.
|
14
|
+
|
15
|
+
He gives his harness bells a shake
|
16
|
+
To ask if there is some mistake.
|
17
|
+
The only other sound’s the sweep
|
18
|
+
Of easy wind and downy flake.
|
19
|
+
|
20
|
+
The woods are lovely, dark and deep,
|
21
|
+
But I have promises to keep,
|
22
|
+
And miles to go before I sleep,
|
23
|
+
And miles to go before I sleep.
|
data/spec/rika_spec.rb
CHANGED
@@ -4,165 +4,193 @@ require 'spec_helper'
|
|
4
4
|
require 'webrick'
|
5
5
|
|
6
6
|
include WEBrick
|
7
|
-
|
8
|
-
describe Rika::Parser do
|
9
|
-
before(:all) do
|
10
|
-
@txt_parser = Rika::Parser.new(file_path("text_file.txt"))
|
11
|
-
@docx_parser = Rika::Parser.new(file_path("document.docx"))
|
12
|
-
@pdf_parser = Rika::Parser.new(file_path("document.pdf"))
|
13
|
-
@image_parser = Rika::Parser.new(file_path("image.jpg"))
|
14
|
-
@unknown_parser = Rika::Parser.new(file_path("unknown.bin"))
|
15
|
-
@dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
|
16
|
-
port = 50505
|
17
|
-
@url = "http://#{Socket.gethostname}:#{port}"
|
18
|
-
@quote = "First they ignore you, then they ridicule you, then they fight you, then you win."
|
19
|
-
@t1 = Thread.new do
|
20
|
-
@server = HTTPServer.new(:Port => port, :DocumentRoot => @dir,
|
21
|
-
:AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
|
22
|
-
@server.start
|
23
|
-
end
|
24
|
-
@sample_pdf_filespec = file_path("document.pdf")
|
25
|
-
end
|
26
7
|
|
27
|
-
|
28
|
-
|
29
|
-
|
8
|
+
describe Rika::Parser do
|
9
|
+
|
10
|
+
let (:txt_parser) { Rika::Parser.new(file_path('text_file.txt')) }
|
11
|
+
let (:docx_parser) { Rika::Parser.new(file_path('document.docx')) }
|
12
|
+
let (:doc_parser) { Rika::Parser.new(file_path('document.doc')) }
|
13
|
+
let (:pdf_parser) { Rika::Parser.new(file_path('document.pdf')) }
|
14
|
+
let (:image_parser) { Rika::Parser.new(file_path('image.jpg')) }
|
15
|
+
let (:unknown_parser) { Rika::Parser.new(file_path('unknown.bin')) }
|
16
|
+
let (:dir) { File.expand_path(File.join(File.dirname(__FILE__), 'fixtures')) }
|
17
|
+
let (:quote_first_line) { 'Stopping by Woods on a Snowy Evening' }
|
18
|
+
|
19
|
+
port = 50515
|
20
|
+
let (:url) { "http://#{Socket.gethostname}:#{port}" }
|
21
|
+
|
22
|
+
let (:sample_pdf_filespec) { file_path('document.pdf') }
|
23
|
+
|
24
|
+
let(:first_line) { ->(string) { string.split("\n").first.strip } }
|
25
|
+
|
26
|
+
let(:server_runner) do
|
27
|
+
# returns a lambda that, when passed an action, will wrap it in an HTTP server
|
28
|
+
->(action) do
|
29
|
+
server = nil
|
30
|
+
server_thread = Thread.new do
|
31
|
+
server = HTTPServer.new(
|
32
|
+
Port: port,
|
33
|
+
DocumentRoot: dir,
|
34
|
+
AccessLog: [],
|
35
|
+
Logger: WEBrick::Log::new('/dev/null')
|
36
|
+
)
|
37
|
+
server.start
|
38
|
+
end
|
39
|
+
|
40
|
+
# Wait for server to become ready on its new thread
|
41
|
+
sleep 0.01 while server.nil?
|
42
|
+
begin
|
43
|
+
action.call
|
44
|
+
ensure
|
45
|
+
server.shutdown
|
46
|
+
server_thread.exit
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
30
50
|
|
31
|
-
it "should raise error if file does not exists" do
|
32
|
-
lambda { Rika::Parser.new(file_path("nonsense.txt")) }.should raise_error(IOError)
|
33
|
-
end
|
34
51
|
|
35
|
-
it
|
36
|
-
|
52
|
+
it 'should raise error if file does not exist' do
|
53
|
+
expect(-> { Rika::Parser.new(file_path('nonexistent_file.txt')) }).to raise_error(IOError)
|
37
54
|
end
|
38
55
|
|
39
|
-
it
|
40
|
-
|
41
|
-
|
56
|
+
it 'should raise error if URL does not exist' do
|
57
|
+
unavailable_server = 'http://k6075sd0dfkr8nvfw0zvwfwckucf2aba.com'
|
58
|
+
unavailable_file_on_web = File.join(unavailable_server, 'x.pdf')
|
59
|
+
expect(-> { Rika::Parser.new(unavailable_file_on_web) }).to raise_error(SocketError)
|
42
60
|
end
|
43
61
|
|
44
|
-
it
|
45
|
-
|
62
|
+
it 'should detect file type without a file extension' do
|
63
|
+
parser = Rika::Parser.new(file_path('text_file_without_extension'))
|
64
|
+
expect(parser.metadata['Content-Type']).to eq('text/plain; charset=UTF-8')
|
46
65
|
end
|
47
66
|
|
48
67
|
describe '#content' do
|
49
|
-
it
|
50
|
-
|
68
|
+
it 'should return the content in a text file' do
|
69
|
+
expect(first_line.(txt_parser.content)).to eq(quote_first_line)
|
51
70
|
end
|
52
71
|
|
53
|
-
it
|
54
|
-
|
72
|
+
it 'should return the content in a docx file' do
|
73
|
+
expect(first_line.(docx_parser.content)).to eq(quote_first_line)
|
55
74
|
end
|
56
75
|
|
57
|
-
it
|
58
|
-
|
76
|
+
it 'should return the content in a pdf file' do
|
77
|
+
expect(first_line.(pdf_parser.content)).to eq(quote_first_line)
|
59
78
|
end
|
60
79
|
|
61
|
-
it
|
62
|
-
|
80
|
+
it 'should return no content for an image' do
|
81
|
+
expect(image_parser.metadata.keys).to_not be_empty
|
63
82
|
end
|
64
83
|
|
65
|
-
it
|
66
|
-
|
67
|
-
parser.content.should == "First"
|
84
|
+
it 'should only return max content length' do
|
85
|
+
expect(Rika::Parser.new(file_path('text_file.txt'), 9).content).to eq('Stopping')
|
68
86
|
end
|
69
87
|
|
70
|
-
it
|
71
|
-
|
72
|
-
|
88
|
+
it 'should only return max content length for file over http', focus: true do
|
89
|
+
server_runner.call( -> do
|
90
|
+
expect(Rika::Parser.new(File.join(url, 'document.pdf'), 9).content).to eq('Stopping')
|
91
|
+
end)
|
73
92
|
end
|
74
93
|
|
75
|
-
it
|
76
|
-
|
77
|
-
|
94
|
+
it 'should return the content from a file over http' do
|
95
|
+
server_runner.call( -> do
|
96
|
+
content = Rika::Parser.new(File.join(url, 'document.pdf')).content
|
97
|
+
expect(first_line.(content)).to eq(quote_first_line)
|
98
|
+
end)
|
78
99
|
end
|
79
100
|
|
80
|
-
it
|
81
|
-
|
82
|
-
parser.content.should == @quote
|
83
|
-
end
|
84
|
-
|
85
|
-
it "should return empty string for unknown file" do
|
86
|
-
@unknown_parser.content.should be_empty
|
101
|
+
it 'should return empty string for unknown file' do
|
102
|
+
expect(unknown_parser.content).to be_empty
|
87
103
|
end
|
88
104
|
end
|
89
105
|
|
90
|
-
# We just test a few of the metadata fields for some common file formats
|
91
|
-
# to make sure the integration with Apache Tika works. Apache Tika already
|
106
|
+
# We just test a few of the metadata fields for some common file formats
|
107
|
+
# to make sure the integration with Apache Tika works. Apache Tika already
|
92
108
|
# has tests for all file formats it supports, so we won't retest them
|
93
109
|
describe '#metadata' do
|
94
|
-
it
|
95
|
-
|
110
|
+
it 'should return nil if metadata field does not exist' do
|
111
|
+
expect(txt_parser.metadata['nonsense']).to be_nil
|
96
112
|
end
|
97
113
|
|
98
|
-
it
|
99
|
-
|
114
|
+
it 'should return metadata from a docx file' do
|
115
|
+
expect(docx_parser.metadata['Page-Count']).to eq('1')
|
100
116
|
end
|
101
117
|
|
102
|
-
it
|
103
|
-
|
118
|
+
it 'should return metadata from a pdf file' do
|
119
|
+
expect(pdf_parser.metadata['Author']).to eq('Robert Frost')
|
104
120
|
end
|
105
121
|
|
106
|
-
it
|
107
|
-
|
108
|
-
|
122
|
+
it 'should return metadata from a file over http', focus: true do
|
123
|
+
server_runner.call( -> do
|
124
|
+
parser = Rika::Parser.new(File.join(url, 'document.pdf'))
|
125
|
+
expect(parser.metadata['Author']).to eq('Robert Frost')
|
126
|
+
end)
|
109
127
|
end
|
110
128
|
|
111
|
-
it
|
112
|
-
|
113
|
-
|
129
|
+
it 'should return metadata from an image' do
|
130
|
+
expect(image_parser.metadata['Image Height']).to eq('72 pixels')
|
131
|
+
expect(image_parser.metadata['Image Width']).to eq('72 pixels')
|
114
132
|
end
|
115
133
|
end
|
116
134
|
|
117
135
|
describe '#available_metadata' do
|
118
|
-
it
|
119
|
-
|
136
|
+
it 'should return available metadata fields' do
|
137
|
+
expect(txt_parser.available_metadata).to_not be_empty
|
120
138
|
end
|
121
139
|
|
122
|
-
it
|
123
|
-
|
140
|
+
it 'should be an array' do
|
141
|
+
expect(txt_parser.available_metadata).to be_an(Array)
|
124
142
|
end
|
125
143
|
end
|
126
144
|
|
127
145
|
describe '#metadata_exists?' do
|
128
|
-
it
|
129
|
-
|
146
|
+
it 'should return false if metadata does not exist' do
|
147
|
+
expect(txt_parser.metadata_exists?('title')).to be false
|
130
148
|
end
|
131
149
|
|
132
|
-
it
|
133
|
-
|
150
|
+
it 'should return true if metadata exist' do
|
151
|
+
expect(docx_parser.metadata_exists?('title')).to be true
|
134
152
|
end
|
135
153
|
end
|
136
154
|
|
137
155
|
describe '#media_type' do
|
138
|
-
it
|
139
|
-
|
156
|
+
it 'should return application/pdf for a pdf file' do
|
157
|
+
expect(pdf_parser.media_type).to eq('application/pdf')
|
158
|
+
end
|
159
|
+
|
160
|
+
it 'should return text/plain for a txt file' do
|
161
|
+
expect(txt_parser.media_type).to eq('text/plain')
|
140
162
|
end
|
141
163
|
|
142
|
-
it
|
143
|
-
|
164
|
+
it 'should return application/pdf for a pdf over http' do
|
165
|
+
server_runner.call( -> do
|
166
|
+
parser = Rika::Parser.new(File.join(url, 'document.pdf'))
|
167
|
+
expect(parser.media_type).to eq('application/pdf')
|
168
|
+
end)
|
144
169
|
end
|
145
170
|
|
146
|
-
it
|
147
|
-
|
148
|
-
parser.media_type.should == "application/pdf"
|
171
|
+
it 'should return application/octet-stream for unknown file' do
|
172
|
+
expect(unknown_parser.media_type).to eq('application/octet-stream')
|
149
173
|
end
|
150
174
|
|
151
|
-
it
|
152
|
-
|
175
|
+
it 'should return msword for a doc file' do
|
176
|
+
expect(doc_parser.media_type).to eq('application/msword')
|
177
|
+
end
|
178
|
+
|
179
|
+
it 'should return wordprocessingml for a docx file' do
|
180
|
+
expect(docx_parser.media_type).to eq('application/vnd.openxmlformats-officedocument.wordprocessingml.document')
|
153
181
|
end
|
154
182
|
end
|
155
183
|
|
156
184
|
describe '#language' do
|
157
|
-
it
|
158
|
-
|
159
|
-
["en", "de", "fr", "ru", "es"].each do |lang|
|
185
|
+
it 'should return the language of the content' do
|
186
|
+
%w(en de fr ru es).each do |lang|
|
160
187
|
txt = Rika::Parser.new(file_path("#{lang}.txt"))
|
161
|
-
txt.language.
|
188
|
+
expect(txt.language).to eq(lang)
|
162
189
|
end
|
163
190
|
end
|
164
191
|
end
|
165
192
|
|
193
|
+
# See note in rika.rb #language_is_reasonably_certain? regarding this method's future.
|
166
194
|
describe '#language_is_reasonably_certain?' do
|
167
195
|
it "should return false if lang can't be determined" do
|
168
196
|
lang = Rika::Parser.new(file_path("lang_cant_be_determined.txt"))
|
@@ -175,19 +203,43 @@ describe Rika::Parser do
|
|
175
203
|
end
|
176
204
|
end
|
177
205
|
|
178
|
-
it
|
179
|
-
content = Rika.parse_content(
|
180
|
-
(content.
|
206
|
+
it 'should return valid content using Rika.parse_content' do
|
207
|
+
content = Rika.parse_content(sample_pdf_filespec)
|
208
|
+
expect(content).to be_a(String)
|
209
|
+
expect(content).to_not be_empty
|
181
210
|
end
|
182
211
|
|
183
|
-
it
|
184
|
-
metadata = Rika.parse_metadata(
|
185
|
-
(metadata.
|
212
|
+
it 'should return valid metadata using Rika.parse_metadata' do
|
213
|
+
metadata = Rika.parse_metadata(sample_pdf_filespec)
|
214
|
+
expect(metadata).to be_a(Hash)
|
215
|
+
expect(metadata).to_not be_empty
|
186
216
|
end
|
187
217
|
|
188
|
-
it
|
189
|
-
content, metadata = Rika.parse_content_and_metadata(
|
190
|
-
(content.
|
191
|
-
|
218
|
+
it 'should return valid content and metadata using Rika.parse_content_and_metadata' do
|
219
|
+
content, metadata = Rika.parse_content_and_metadata(sample_pdf_filespec)
|
220
|
+
expect(content).to be_a(String)
|
221
|
+
expect(content).to_not be_empty
|
222
|
+
expect(metadata).to be_a(Hash)
|
223
|
+
expect(metadata).to_not be_empty
|
224
|
+
end
|
225
|
+
|
226
|
+
specify 'both means of getting both content and metadata should return the same values' do
|
227
|
+
content_1, metadata_1 = Rika.parse_content_and_metadata(sample_pdf_filespec)
|
228
|
+
|
229
|
+
h = Rika.parse_content_and_metadata_as_hash(sample_pdf_filespec)
|
230
|
+
content_2 = h[:content]
|
231
|
+
metadata_2 = h[:metadata]
|
232
|
+
|
233
|
+
expect(content_1).to eq(content_2)
|
234
|
+
expect(metadata_1).to eq(metadata_2)
|
235
|
+
end
|
236
|
+
|
237
|
+
specify 'getting content and metadata individually and together should return the same values' do
|
238
|
+
content_1, metadata_1 = Rika.parse_content_and_metadata(sample_pdf_filespec, -1)
|
239
|
+
content_2 = Rika.parse_content(sample_pdf_filespec)
|
240
|
+
metadata_2 = Rika.parse_metadata(sample_pdf_filespec, -1)
|
241
|
+
|
242
|
+
expect(content_1).to eq(content_2)
|
243
|
+
expect(metadata_1).to eq(metadata_2)
|
192
244
|
end
|
193
245
|
end
|