rika 1.6.0-java → 1.11.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.travis.yml +3 -3
- data/README.md +82 -40
- data/RELEASE_NOTES.md +17 -0
- data/java-lib/tika-app-1.24.1.jar +0 -0
- data/lib/rika.rb +17 -96
- data/lib/rika/parser.rb +90 -0
- data/lib/rika/version.rb +1 -1
- data/pom.xml +2 -2
- data/rika.gemspec +9 -7
- data/rika_helper.rb +38 -0
- data/spec/fixtures/de.txt +21 -1
- data/spec/fixtures/document.doc +0 -0
- data/spec/fixtures/document.docx +0 -0
- data/spec/fixtures/document.pdf +0 -0
- data/spec/fixtures/en.txt +23 -1
- data/spec/fixtures/es.txt +21 -1
- data/spec/fixtures/fr.txt +23 -1
- data/spec/fixtures/ru.txt +21 -1
- data/spec/fixtures/text_file.txt +23 -1
- data/spec/fixtures/text_file_without_extension +23 -1
- data/spec/rika_spec.rb +145 -102
- data/spec/spec_helper.rb +4 -3
- metadata +31 -66
- data/spec/fixtures/over_100k_file.txt +0 -1241
- data/target/dependency/apache-mime4j-core-0.7.2.jar +0 -0
- data/target/dependency/apache-mime4j-dom-0.7.2.jar +0 -0
- data/target/dependency/asm-debug-all-4.1.jar +0 -0
- data/target/dependency/aspectjrt-1.8.0.jar +0 -0
- data/target/dependency/bcmail-jdk15-1.45.jar +0 -0
- data/target/dependency/bcprov-jdk15-1.45.jar +0 -0
- data/target/dependency/boilerpipe-1.1.0.jar +0 -0
- data/target/dependency/commons-codec-1.9.jar +0 -0
- data/target/dependency/commons-compress-1.8.1.jar +0 -0
- data/target/dependency/commons-httpclient-3.1.jar +0 -0
- data/target/dependency/commons-logging-1.1.1.jar +0 -0
- data/target/dependency/fontbox-1.8.6.jar +0 -0
- data/target/dependency/isoparser-1.0.2.jar +0 -0
- data/target/dependency/java-libpst-0.8.1.jar +0 -0
- data/target/dependency/jcip-annotations-1.0.jar +0 -0
- data/target/dependency/jdom-1.0.jar +0 -0
- data/target/dependency/jempbox-1.8.6.jar +0 -0
- data/target/dependency/jhighlight-1.0.jar +0 -0
- data/target/dependency/jmatio-1.0.jar +0 -0
- data/target/dependency/juniversalchardet-1.0.3.jar +0 -0
- data/target/dependency/metadata-extractor-2.6.2.jar +0 -0
- data/target/dependency/netcdf-4.2.20.jar +0 -0
- data/target/dependency/pdfbox-1.8.6.jar +0 -0
- data/target/dependency/poi-3.11-beta2.jar +0 -0
- data/target/dependency/poi-ooxml-3.11-beta2.jar +0 -0
- data/target/dependency/poi-ooxml-schemas-3.11-beta2.jar +0 -0
- data/target/dependency/poi-scratchpad-3.11-beta2.jar +0 -0
- data/target/dependency/rome-1.0.jar +0 -0
- data/target/dependency/slf4j-api-1.6.1.jar +0 -0
- data/target/dependency/tagsoup-1.2.1.jar +0 -0
- data/target/dependency/tika-core-1.6.jar +0 -0
- data/target/dependency/tika-parsers-1.6.jar +0 -0
- data/target/dependency/unidataCommon-4.2.20.jar +0 -0
- data/target/dependency/vorbis-java-core-0.6.jar +0 -0
- data/target/dependency/vorbis-java-tika-0.6.jar +0 -0
- data/target/dependency/xercesImpl-2.8.1.jar +0 -0
- data/target/dependency/xml-apis-1.3.03.jar +0 -0
- data/target/dependency/xmlbeans-2.6.0.jar +0 -0
- data/target/dependency/xmpcore-5.1.2.jar +0 -0
- data/target/dependency/xz-1.5.jar +0 -0
data/rika.gemspec
CHANGED
|
@@ -6,16 +6,18 @@ require 'rika/version'
|
|
|
6
6
|
Gem::Specification.new do |gem|
|
|
7
7
|
gem.name = "rika"
|
|
8
8
|
gem.version = Rika::VERSION
|
|
9
|
-
gem.authors = ["Richard Nyström"]
|
|
10
|
-
gem.email = ["ricny046@gmail.com"]
|
|
11
|
-
gem.description = %q{ A JRuby wrapper for Apache Tika to extract text and metadata from various
|
|
12
|
-
gem.summary = %q{ A JRuby wrapper for Apache Tika to extract text and metadata from various
|
|
13
|
-
gem.homepage = "https://github.com/
|
|
9
|
+
gem.authors = ["Richard Nyström", "Keith Bennett"]
|
|
10
|
+
gem.email = ["ricny046@gmail.com", "keithrbennett@gmail.com"]
|
|
11
|
+
gem.description = %q{ A JRuby wrapper for Apache Tika to extract text and metadata from files of various formats. }
|
|
12
|
+
gem.summary = %q{ A JRuby wrapper for Apache Tika to extract text and metadata from files of various formats. }
|
|
13
|
+
gem.homepage = "https://github.com/keithrbennett/rika"
|
|
14
14
|
gem.files = `git ls-files`.split($/)
|
|
15
15
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
|
16
16
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
|
17
17
|
gem.require_paths = ["lib"]
|
|
18
|
-
gem.add_development_dependency "rspec", "
|
|
19
|
-
gem.add_development_dependency "rake", "
|
|
18
|
+
gem.add_development_dependency "rspec", "~> 3.9"
|
|
19
|
+
gem.add_development_dependency "rake", "~> 13.0"
|
|
20
20
|
gem.platform = "java"
|
|
21
|
+
gem.license = "Apache-2.0"
|
|
21
22
|
end
|
|
23
|
+
|
data/rika_helper.rb
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Defines some shortcuts for ad-hoc work with Rika.
|
|
2
|
+
#
|
|
3
|
+
# Can be used with the `irb`/`jirb` or `pry` (https://github.com/pry/pry) interactive shells:
|
|
4
|
+
# `pry -r rika_helper.rb`
|
|
5
|
+
#
|
|
6
|
+
# Can be used with the `rexe` command line executor (https://github.com/keithrbennett/rexe):
|
|
7
|
+
# rexe -r ./rika_helper.rb # e.g., add: `-oa 'm "x.pdf"'` to output metadata w/AwesomePrint
|
|
8
|
+
#
|
|
9
|
+
# or plain Ruby:
|
|
10
|
+
# ruby -r ./rika_helper -r awesome_print -e 'ap m("x.pdf")'
|
|
11
|
+
|
|
12
|
+
require 'rika'
|
|
13
|
+
|
|
14
|
+
def c(resource)
|
|
15
|
+
Rika.parse_content(resource)
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def m(resource)
|
|
19
|
+
Rika.parse_metadata(resource)
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
def cm(resource)
|
|
23
|
+
Rika.parse_content_and_metadata(resource)
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def cmh(resource)
|
|
27
|
+
Rika.parse_content_and_metadata_as_hash(resource)
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
# Returns the resource's metadata as compact JSON; 'json' is required lazily in case nothing else has loaded it.
def mj(resource); require 'json'; m(resource).to_json; end
|
|
31
|
+
# Returns the resource's metadata as pretty-printed JSON; 'json' is required lazily so the JSON constant is defined.
def mJ(resource); require 'json'; JSON.pretty_generate(m(resource)); end
|
|
32
|
+
# Returns the resource's metadata as YAML; 'yaml' is required lazily so #to_yaml is available.
def my(resource); require 'yaml'; m(resource).to_yaml; end
|
|
33
|
+
# Returns the resource's metadata formatted by AwesomePrint. Named `ma` (not `my`) so it no longer overwrites the YAML helper.
def ma(resource); require 'awesome_print'; m(resource).ai; end
|
|
34
|
+
|
|
35
|
+
# Returns content AND metadata (the hash from cmh) as compact JSON; previously only the content string was serialized.
def cmj(resource); require 'json'; cmh(resource).to_json; end
|
|
36
|
+
# Returns content AND metadata (the hash from cmh) as pretty-printed JSON; previously only the content string was passed.
def cmJ(resource); require 'json'; JSON.pretty_generate(cmh(resource)); end
|
|
37
|
+
# Returns content AND metadata (the hash from cmh) as YAML; previously only the content string was dumped.
def cmy(resource); require 'yaml'; cmh(resource).to_yaml; end
|
|
38
|
+
# Loads AwesomePrint, then returns { content:, metadata: } built from the pair returned by cm(resource).
def cma(resource); require 'awesome_print'; content, metadata = cm(resource); { content: content, metadata: metadata }; end
|
data/spec/fixtures/de.txt
CHANGED
|
@@ -1 +1,21 @@
|
|
|
1
|
-
|
|
1
|
+
Eines verschneiten Abends im Wald verhaltend (German)
|
|
2
|
+
|
|
3
|
+
Wohl weiß ich, wer den Wald besitzt,
|
|
4
|
+
Sein Haus jedoch im Dorf dort ist;
|
|
5
|
+
Er merkt nicht, wie ich hier verharr'
|
|
6
|
+
Zu schaun, wie Wald den Schnee begrüßt.
|
|
7
|
+
|
|
8
|
+
Mein Pferdchen denkt, ich wär ein Narr,
|
|
9
|
+
Zu halten, wo nicht Haus noch Farm,
|
|
10
|
+
Im Wald, nah beim gefrornen Teich,
|
|
11
|
+
Am finstersten Abend im Jahr.
|
|
12
|
+
|
|
13
|
+
Des Zaumzeugs Glöckchen klingen leis:
|
|
14
|
+
Es fragt: was ist? geht's weiter gleich?
|
|
15
|
+
Ansonsten hört man nur ganz sacht
|
|
16
|
+
Den Schneefall und den Wind, der streicht.
|
|
17
|
+
|
|
18
|
+
Der Wald ist nett, tief wie die Nacht,
|
|
19
|
+
Doch halte ich, was ich versprach,
|
|
20
|
+
Und Meilen sind's noch bis zum Schlaf,
|
|
21
|
+
Und Meilen sind's noch bis zum Schlaf.
|
data/spec/fixtures/document.doc
CHANGED
|
Binary file
|
data/spec/fixtures/document.docx
CHANGED
|
Binary file
|
data/spec/fixtures/document.pdf
CHANGED
|
Binary file
|
data/spec/fixtures/en.txt
CHANGED
|
@@ -1 +1,23 @@
|
|
|
1
|
-
|
|
1
|
+
Stopping by Woods on a Snowy Evening
|
|
2
|
+
|
|
3
|
+
By Robert Frost
|
|
4
|
+
|
|
5
|
+
Whose woods these are I think I know.
|
|
6
|
+
His house is in the village though;
|
|
7
|
+
He will not see me stopping here
|
|
8
|
+
To watch his woods fill up with snow.
|
|
9
|
+
|
|
10
|
+
My little horse must think it queer
|
|
11
|
+
To stop without a farmhouse near
|
|
12
|
+
Between the woods and frozen lake
|
|
13
|
+
The darkest evening of the year.
|
|
14
|
+
|
|
15
|
+
He gives his harness bells a shake
|
|
16
|
+
To ask if there is some mistake.
|
|
17
|
+
The only other sound’s the sweep
|
|
18
|
+
Of easy wind and downy flake.
|
|
19
|
+
|
|
20
|
+
The woods are lovely, dark and deep,
|
|
21
|
+
But I have promises to keep,
|
|
22
|
+
And miles to go before I sleep,
|
|
23
|
+
And miles to go before I sleep.
|
data/spec/fixtures/es.txt
CHANGED
|
@@ -1 +1,21 @@
|
|
|
1
|
-
|
|
1
|
+
Un alto en el bosque mientras nieva
|
|
2
|
+
|
|
3
|
+
De quién es este bosque, saber creo
|
|
4
|
+
-en el poblado su morada veo-
|
|
5
|
+
no habrá de sorprenderme contemplando
|
|
6
|
+
cubrir su bosque el invernal blanqueo.
|
|
7
|
+
|
|
8
|
+
Mi caballito se dirá extrañado
|
|
9
|
+
que, sin granja cercana, hemos parado
|
|
10
|
+
de este año en la tarde más oscura,
|
|
11
|
+
entre el bosque y el lago congelado.
|
|
12
|
+
|
|
13
|
+
Sacudiéndose, agita su cencerro
|
|
14
|
+
preguntando quizá: -¿será algún yerro?
|
|
15
|
+
Sólo el cierzo y los copos rumorean
|
|
16
|
+
blandamente del bosque en el encierro.
|
|
17
|
+
|
|
18
|
+
Yo, el bosque hondo y fusco veo risueño...
|
|
19
|
+
Mas, en cumplir promesas tengo empeño,
|
|
20
|
+
y millas debo andar antes del sueño,
|
|
21
|
+
un largo andar para llegar al sueño.
|
data/spec/fixtures/fr.txt
CHANGED
|
@@ -1 +1,23 @@
|
|
|
1
|
-
En
|
|
1
|
+
En s'arrêtant par les bois un soir de neige
|
|
2
|
+
|
|
3
|
+
Auteur: Robert Frost
|
|
4
|
+
|
|
5
|
+
À qui sont ces bois, je crois que je sais.
|
|
6
|
+
Sa maison est au village pourtant ;
|
|
7
|
+
Il ne me verra pas m'arrêter ici
|
|
8
|
+
À regarder ses bois recouverts de neige.
|
|
9
|
+
|
|
10
|
+
Mon petit cheval doit trouver cela étrange
|
|
11
|
+
De s'arrêter loin de toute ferme aux alentours
|
|
12
|
+
Entre ces bois et ce lac gelé
|
|
13
|
+
Au soir le plus sombre de l'année.
|
|
14
|
+
|
|
15
|
+
Il fait tinter les clochettes de son harnais en tirant dessus
|
|
16
|
+
Comme pour demander s'il n'y a pas là quelque erreur.
|
|
17
|
+
Les seuls autres bruits le souffle
|
|
18
|
+
D'une brise légère et le son duveteux des flocons.
|
|
19
|
+
|
|
20
|
+
Les bois sont beaux, sombres et profonds,
|
|
21
|
+
Mais j'ai des promesses à tenir
|
|
22
|
+
Et un long chemin à parcourir avant de dormir,
|
|
23
|
+
Et un long chemin à parcourir avant de dormir.
|
data/spec/fixtures/ru.txt
CHANGED
|
@@ -1 +1,21 @@
|
|
|
1
|
-
|
|
1
|
+
Остановившись у леса снежным вечером
|
|
2
|
+
|
|
3
|
+
Чей лес, мне кажется, я знаю:
|
|
4
|
+
в селе живет его хозяин.
|
|
5
|
+
Он не увидит, как на снежный
|
|
6
|
+
я лес его стою взираю.
|
|
7
|
+
|
|
8
|
+
В недоуменье конь, конечно,
|
|
9
|
+
зачем в ночи за год темнейшей
|
|
10
|
+
мы стали там, где нет жилья,
|
|
11
|
+
у леса с озером замерзшим.
|
|
12
|
+
|
|
13
|
+
Он, бубенцом слегка звеня,
|
|
14
|
+
как будто бы корит меня,
|
|
15
|
+
да веет слабый ветерок,
|
|
16
|
+
пушистым снегом шелестя.
|
|
17
|
+
|
|
18
|
+
Лес сладок, темен и глубок,
|
|
19
|
+
но в путь пора мне — долг есть долг.
|
|
20
|
+
И ехать долго — сон далек,
|
|
21
|
+
и ехать долго — сон далек.
|
data/spec/fixtures/text_file.txt
CHANGED
|
@@ -1 +1,23 @@
|
|
|
1
|
-
|
|
1
|
+
Stopping by Woods on a Snowy Evening
|
|
2
|
+
|
|
3
|
+
By Robert Frost
|
|
4
|
+
|
|
5
|
+
Whose woods these are I think I know.
|
|
6
|
+
His house is in the village though;
|
|
7
|
+
He will not see me stopping here
|
|
8
|
+
To watch his woods fill up with snow.
|
|
9
|
+
|
|
10
|
+
My little horse must think it queer
|
|
11
|
+
To stop without a farmhouse near
|
|
12
|
+
Between the woods and frozen lake
|
|
13
|
+
The darkest evening of the year.
|
|
14
|
+
|
|
15
|
+
He gives his harness bells a shake
|
|
16
|
+
To ask if there is some mistake.
|
|
17
|
+
The only other sound’s the sweep
|
|
18
|
+
Of easy wind and downy flake.
|
|
19
|
+
|
|
20
|
+
The woods are lovely, dark and deep,
|
|
21
|
+
But I have promises to keep,
|
|
22
|
+
And miles to go before I sleep,
|
|
23
|
+
And miles to go before I sleep.
|
|
@@ -1 +1,23 @@
|
|
|
1
|
-
|
|
1
|
+
Stopping by Woods on a Snowy Evening
|
|
2
|
+
|
|
3
|
+
By Robert Frost
|
|
4
|
+
|
|
5
|
+
Whose woods these are I think I know.
|
|
6
|
+
His house is in the village though;
|
|
7
|
+
He will not see me stopping here
|
|
8
|
+
To watch his woods fill up with snow.
|
|
9
|
+
|
|
10
|
+
My little horse must think it queer
|
|
11
|
+
To stop without a farmhouse near
|
|
12
|
+
Between the woods and frozen lake
|
|
13
|
+
The darkest evening of the year.
|
|
14
|
+
|
|
15
|
+
He gives his harness bells a shake
|
|
16
|
+
To ask if there is some mistake.
|
|
17
|
+
The only other sound’s the sweep
|
|
18
|
+
Of easy wind and downy flake.
|
|
19
|
+
|
|
20
|
+
The woods are lovely, dark and deep,
|
|
21
|
+
But I have promises to keep,
|
|
22
|
+
And miles to go before I sleep,
|
|
23
|
+
And miles to go before I sleep.
|
data/spec/rika_spec.rb
CHANGED
|
@@ -6,85 +6,100 @@ require 'webrick'
|
|
|
6
6
|
include WEBrick
|
|
7
7
|
|
|
8
8
|
describe Rika::Parser do
|
|
9
|
-
before(:all) do
|
|
10
|
-
@txt_parser = Rika::Parser.new(file_path("text_file.txt"))
|
|
11
|
-
@docx_parser = Rika::Parser.new(file_path("document.docx"))
|
|
12
|
-
@doc_parser = Rika::Parser.new(file_path("document.doc"))
|
|
13
|
-
@pdf_parser = Rika::Parser.new(file_path("document.pdf"))
|
|
14
|
-
@image_parser = Rika::Parser.new(file_path("image.jpg"))
|
|
15
|
-
@unknown_parser = Rika::Parser.new(file_path("unknown.bin"))
|
|
16
|
-
@dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
|
|
17
|
-
port = 50515
|
|
18
|
-
@url = "http://#{Socket.gethostname}:#{port}"
|
|
19
|
-
@quote = "First they ignore you, then they ridicule you, then they fight you, then you win."
|
|
20
|
-
@t1 = Thread.new do
|
|
21
|
-
@server = HTTPServer.new(:Port => port, :DocumentRoot => @dir,
|
|
22
|
-
:AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
|
|
23
|
-
@server.start
|
|
24
|
-
end
|
|
25
|
-
@sample_pdf_filespec = file_path("document.pdf")
|
|
26
|
-
end
|
|
27
9
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
10
|
+
let (:txt_parser) { Rika::Parser.new(file_path('text_file.txt')) }
|
|
11
|
+
let (:docx_parser) { Rika::Parser.new(file_path('document.docx')) }
|
|
12
|
+
let (:doc_parser) { Rika::Parser.new(file_path('document.doc')) }
|
|
13
|
+
let (:pdf_parser) { Rika::Parser.new(file_path('document.pdf')) }
|
|
14
|
+
let (:image_parser) { Rika::Parser.new(file_path('image.jpg')) }
|
|
15
|
+
let (:unknown_parser) { Rika::Parser.new(file_path('unknown.bin')) }
|
|
16
|
+
let (:dir) { File.expand_path(File.join(File.dirname(__FILE__), 'fixtures')) }
|
|
17
|
+
let (:quote_first_line) { 'Stopping by Woods on a Snowy Evening' }
|
|
31
18
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
19
|
+
port = 50515
|
|
20
|
+
let (:url) { "http://#{Socket.gethostname}:#{port}" }
|
|
21
|
+
|
|
22
|
+
let (:sample_pdf_filespec) { file_path('document.pdf') }
|
|
23
|
+
|
|
24
|
+
let(:first_line) { ->(string) { string.split("\n").first.strip } }
|
|
25
|
+
|
|
26
|
+
let(:server_runner) do
|
|
27
|
+
# returns a lambda that, when passed an action, will wrap it in an HTTP server
|
|
28
|
+
->(action) do
|
|
29
|
+
server = nil
|
|
30
|
+
server_thread = Thread.new do
|
|
31
|
+
server = HTTPServer.new(
|
|
32
|
+
Port: port,
|
|
33
|
+
DocumentRoot: dir,
|
|
34
|
+
AccessLog: [],
|
|
35
|
+
Logger: WEBrick::Log::new('/dev/null')
|
|
36
|
+
)
|
|
37
|
+
server.start
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
# Wait for server to become ready on its new thread
|
|
41
|
+
sleep 0.01 while server.nil?
|
|
42
|
+
begin
|
|
43
|
+
action.call
|
|
44
|
+
ensure
|
|
45
|
+
server.shutdown
|
|
46
|
+
server_thread.exit
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
end
|
|
35
50
|
|
|
36
|
-
|
|
37
|
-
|
|
51
|
+
|
|
52
|
+
it 'should raise error if file does not exist' do
|
|
53
|
+
expect(-> { Rika::Parser.new(file_path('nonexistent_file.txt')) }).to raise_error(IOError)
|
|
38
54
|
end
|
|
39
55
|
|
|
40
|
-
it
|
|
41
|
-
|
|
42
|
-
|
|
56
|
+
it 'should raise error if URL does not exist' do
|
|
57
|
+
unavailable_server = 'http://k6075sd0dfkr8nvfw0zvwfwckucf2aba.com'
|
|
58
|
+
unavailable_file_on_web = File.join(unavailable_server, 'x.pdf')
|
|
59
|
+
expect(-> { Rika::Parser.new(unavailable_file_on_web) }).to raise_error(SocketError)
|
|
43
60
|
end
|
|
44
61
|
|
|
45
|
-
it
|
|
46
|
-
|
|
62
|
+
it 'should detect file type without a file extension' do
|
|
63
|
+
parser = Rika::Parser.new(file_path('text_file_without_extension'))
|
|
64
|
+
expect(parser.metadata['Content-Type']).to eq('text/plain; charset=UTF-8')
|
|
47
65
|
end
|
|
48
66
|
|
|
49
67
|
describe '#content' do
|
|
50
|
-
it
|
|
51
|
-
|
|
52
|
-
end
|
|
53
|
-
|
|
54
|
-
it "should return the content in a docx file" do
|
|
55
|
-
@docx_parser.content.should == @quote
|
|
68
|
+
it 'should return the content in a text file' do
|
|
69
|
+
expect(first_line.(txt_parser.content)).to eq(quote_first_line)
|
|
56
70
|
end
|
|
57
71
|
|
|
58
|
-
it
|
|
59
|
-
|
|
72
|
+
it 'should return the content in a docx file' do
|
|
73
|
+
expect(first_line.(docx_parser.content)).to eq(quote_first_line)
|
|
60
74
|
end
|
|
61
75
|
|
|
62
|
-
it
|
|
63
|
-
|
|
76
|
+
it 'should return the content in a pdf file' do
|
|
77
|
+
expect(first_line.(pdf_parser.content)).to eq(quote_first_line)
|
|
64
78
|
end
|
|
65
79
|
|
|
66
|
-
it
|
|
67
|
-
|
|
68
|
-
parser.content.should == "First"
|
|
80
|
+
it 'should return no content for an image' do
|
|
81
|
+
expect(image_parser.metadata.keys).to_not be_empty
|
|
69
82
|
end
|
|
70
83
|
|
|
71
|
-
it
|
|
72
|
-
|
|
73
|
-
parser.content.should == "First"
|
|
84
|
+
it 'should only return max content length' do
|
|
85
|
+
expect(Rika::Parser.new(file_path('text_file.txt'), 9).content).to eq('Stopping')
|
|
74
86
|
end
|
|
75
87
|
|
|
76
|
-
it
|
|
77
|
-
|
|
78
|
-
|
|
88
|
+
it 'should only return max content length for file over http', focus: true do
|
|
89
|
+
server_runner.call( -> do
|
|
90
|
+
expect(Rika::Parser.new(File.join(url, 'document.pdf'), 9).content).to eq('Stopping')
|
|
91
|
+
end)
|
|
79
92
|
end
|
|
80
93
|
|
|
81
|
-
it
|
|
82
|
-
|
|
83
|
-
|
|
94
|
+
it 'should return the content from a file over http' do
|
|
95
|
+
server_runner.call( -> do
|
|
96
|
+
content = Rika::Parser.new(File.join(url, 'document.pdf')).content
|
|
97
|
+
expect(first_line.(content)).to eq(quote_first_line)
|
|
98
|
+
end)
|
|
84
99
|
end
|
|
85
100
|
|
|
86
|
-
it
|
|
87
|
-
|
|
101
|
+
it 'should return empty string for unknown file' do
|
|
102
|
+
expect(unknown_parser.content).to be_empty
|
|
88
103
|
end
|
|
89
104
|
end
|
|
90
105
|
|
|
@@ -92,86 +107,90 @@ describe Rika::Parser do
|
|
|
92
107
|
# to make sure the integration with Apache Tika works. Apache Tika already
|
|
93
108
|
# has tests for all file formats it supports so we won't retest that
|
|
94
109
|
describe '#metadata' do
|
|
95
|
-
it
|
|
96
|
-
|
|
110
|
+
it 'should return nil if metadata field does not exist' do
|
|
111
|
+
expect(txt_parser.metadata['nonsense']).to be_nil
|
|
97
112
|
end
|
|
98
113
|
|
|
99
|
-
it
|
|
100
|
-
|
|
114
|
+
it 'should return metadata from a docx file' do
|
|
115
|
+
expect(docx_parser.metadata['Page-Count']).to eq('1')
|
|
101
116
|
end
|
|
102
117
|
|
|
103
|
-
it
|
|
104
|
-
|
|
118
|
+
it 'should return metadata from a pdf file' do
|
|
119
|
+
expect(pdf_parser.metadata['Author']).to eq('Robert Frost')
|
|
105
120
|
end
|
|
106
121
|
|
|
107
|
-
it
|
|
108
|
-
|
|
109
|
-
|
|
122
|
+
it 'should return metadata from a file over http', focus: true do
|
|
123
|
+
server_runner.call( -> do
|
|
124
|
+
parser = Rika::Parser.new(File.join(url, 'document.pdf'))
|
|
125
|
+
expect(parser.metadata['Author']).to eq('Robert Frost')
|
|
126
|
+
end)
|
|
110
127
|
end
|
|
111
128
|
|
|
112
|
-
it
|
|
113
|
-
|
|
114
|
-
|
|
129
|
+
it 'should return metadata from an image' do
|
|
130
|
+
expect(image_parser.metadata['Image Height']).to eq('72 pixels')
|
|
131
|
+
expect(image_parser.metadata['Image Width']).to eq('72 pixels')
|
|
115
132
|
end
|
|
116
133
|
end
|
|
117
134
|
|
|
118
135
|
describe '#available_metadata' do
|
|
119
|
-
it
|
|
120
|
-
|
|
136
|
+
it 'should return available metadata fields' do
|
|
137
|
+
expect(txt_parser.available_metadata).to_not be_empty
|
|
121
138
|
end
|
|
122
139
|
|
|
123
|
-
it
|
|
124
|
-
|
|
140
|
+
it 'should be an array' do
|
|
141
|
+
expect(txt_parser.available_metadata).to be_an(Array)
|
|
125
142
|
end
|
|
126
143
|
end
|
|
127
144
|
|
|
128
145
|
describe '#metadata_exists?' do
|
|
129
|
-
it
|
|
130
|
-
|
|
146
|
+
it 'should return false if metadata does not exist' do
|
|
147
|
+
expect(txt_parser.metadata_exists?('title')).to be false
|
|
131
148
|
end
|
|
132
149
|
|
|
133
|
-
it
|
|
134
|
-
|
|
150
|
+
it 'should return true if metadata exist' do
|
|
151
|
+
expect(docx_parser.metadata_exists?('title')).to be true
|
|
135
152
|
end
|
|
136
153
|
end
|
|
137
154
|
|
|
138
155
|
describe '#media_type' do
|
|
139
|
-
it
|
|
140
|
-
|
|
156
|
+
it 'should return application/pdf for a pdf file' do
|
|
157
|
+
expect(pdf_parser.media_type).to eq('application/pdf')
|
|
141
158
|
end
|
|
142
159
|
|
|
143
|
-
it
|
|
144
|
-
|
|
160
|
+
it 'should return text/plain for a txt file' do
|
|
161
|
+
expect(txt_parser.media_type).to eq('text/plain')
|
|
145
162
|
end
|
|
146
163
|
|
|
147
|
-
it
|
|
148
|
-
|
|
149
|
-
|
|
164
|
+
it 'should return application/pdf for a pdf over http' do
|
|
165
|
+
server_runner.call( -> do
|
|
166
|
+
parser = Rika::Parser.new(File.join(url, 'document.pdf'))
|
|
167
|
+
expect(parser.media_type).to eq('application/pdf')
|
|
168
|
+
end)
|
|
150
169
|
end
|
|
151
170
|
|
|
152
|
-
it
|
|
153
|
-
|
|
171
|
+
it 'should return application/octet-stream for unknown file' do
|
|
172
|
+
expect(unknown_parser.media_type).to eq('application/octet-stream')
|
|
154
173
|
end
|
|
155
174
|
|
|
156
|
-
it
|
|
157
|
-
|
|
175
|
+
it 'should return msword for a doc file' do
|
|
176
|
+
expect(doc_parser.media_type).to eq('application/msword')
|
|
158
177
|
end
|
|
159
178
|
|
|
160
|
-
it
|
|
161
|
-
|
|
179
|
+
it 'should return wordprocessingml for a docx file' do
|
|
180
|
+
expect(docx_parser.media_type).to eq('application/vnd.openxmlformats-officedocument.wordprocessingml.document')
|
|
162
181
|
end
|
|
163
182
|
end
|
|
164
183
|
|
|
165
184
|
describe '#language' do
|
|
166
|
-
it
|
|
167
|
-
|
|
168
|
-
["en", "de", "fr", "ru", "es"].each do |lang|
|
|
185
|
+
it 'should return the language of the content' do
|
|
186
|
+
%w(en de fr ru es).each do |lang|
|
|
169
187
|
txt = Rika::Parser.new(file_path("#{lang}.txt"))
|
|
170
|
-
txt.language.
|
|
188
|
+
expect(txt.language).to eq(lang)
|
|
171
189
|
end
|
|
172
190
|
end
|
|
173
191
|
end
|
|
174
192
|
|
|
193
|
+
# See note in rika.rb #language_is_reasonably_certain? regarding this method's future.
|
|
175
194
|
describe '#language_is_reasonably_certain?' do
|
|
176
195
|
it "should return false if lang can't be determined" do
|
|
177
196
|
lang = Rika::Parser.new(file_path("lang_cant_be_determined.txt"))
|
|
@@ -184,19 +203,43 @@ describe Rika::Parser do
|
|
|
184
203
|
end
|
|
185
204
|
end
|
|
186
205
|
|
|
187
|
-
it
|
|
188
|
-
content = Rika.parse_content(
|
|
189
|
-
(content.
|
|
206
|
+
it 'should return valid content using Rika.parse_content' do
|
|
207
|
+
content = Rika.parse_content(sample_pdf_filespec)
|
|
208
|
+
expect(content).to be_a(String)
|
|
209
|
+
expect(content).to_not be_empty
|
|
190
210
|
end
|
|
191
211
|
|
|
192
|
-
it
|
|
193
|
-
metadata = Rika.parse_metadata(
|
|
194
|
-
(metadata.
|
|
212
|
+
it 'should return valid metadata using Rika.parse_metadata' do
|
|
213
|
+
metadata = Rika.parse_metadata(sample_pdf_filespec)
|
|
214
|
+
expect(metadata).to be_a(Hash)
|
|
215
|
+
expect(metadata).to_not be_empty
|
|
195
216
|
end
|
|
196
217
|
|
|
197
|
-
it
|
|
198
|
-
content, metadata = Rika.parse_content_and_metadata(
|
|
199
|
-
(content.
|
|
200
|
-
|
|
218
|
+
it 'should return valid content and metadata using Rika.parse_content_and_metadata' do
|
|
219
|
+
content, metadata = Rika.parse_content_and_metadata(sample_pdf_filespec)
|
|
220
|
+
expect(content).to be_a(String)
|
|
221
|
+
expect(content).to_not be_empty
|
|
222
|
+
expect(metadata).to be_a(Hash)
|
|
223
|
+
expect(metadata).to_not be_empty
|
|
224
|
+
end
|
|
225
|
+
|
|
226
|
+
specify 'both means of getting both content and metadata should return the same values' do
|
|
227
|
+
content_1, metadata_1 = Rika.parse_content_and_metadata(sample_pdf_filespec)
|
|
228
|
+
|
|
229
|
+
h = Rika.parse_content_and_metadata_as_hash(sample_pdf_filespec)
|
|
230
|
+
content_2 = h[:content]
|
|
231
|
+
metadata_2 = h[:metadata]
|
|
232
|
+
|
|
233
|
+
expect(content_1).to eq(content_2)
|
|
234
|
+
expect(metadata_1).to eq(metadata_2)
|
|
235
|
+
end
|
|
236
|
+
|
|
237
|
+
specify 'getting content and metadata individually and together should return the same values' do
|
|
238
|
+
content_1, metadata_1 = Rika.parse_content_and_metadata(sample_pdf_filespec, -1)
|
|
239
|
+
content_2 = Rika.parse_content(sample_pdf_filespec)
|
|
240
|
+
metadata_2 = Rika.parse_metadata(sample_pdf_filespec, -1)
|
|
241
|
+
|
|
242
|
+
expect(content_1).to eq(content_2)
|
|
243
|
+
expect(metadata_1).to eq(metadata_2)
|
|
201
244
|
end
|
|
202
245
|
end
|