rika 1.1.1-java → 1.11.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/.travis.yml +3 -3
- data/README.md +82 -40
- data/RELEASE_NOTES.md +17 -0
- data/Rakefile +1 -1
- data/java-lib/tika-app-1.24.1.jar +0 -0
- data/lib/rika.rb +18 -93
- data/lib/rika/parser.rb +90 -0
- data/lib/rika/version.rb +1 -1
- data/pom.xml +4 -4
- data/rika.gemspec +9 -7
- data/rika_helper.rb +38 -0
- data/spec/fixtures/de.txt +21 -1
- data/spec/fixtures/document.doc +0 -0
- data/spec/fixtures/document.docx +0 -0
- data/spec/fixtures/document.pdf +0 -0
- data/spec/fixtures/en.txt +23 -1
- data/spec/fixtures/es.txt +21 -1
- data/spec/fixtures/fr.txt +23 -1
- data/spec/fixtures/ru.txt +21 -1
- data/spec/fixtures/text_file.txt +23 -1
- data/spec/fixtures/text_file_without_extension +23 -1
- data/spec/rika_spec.rb +153 -101
- data/spec/spec_helper.rb +4 -3
- metadata +36 -76
- data/spec/fixtures/over_100k_file.txt +0 -1241
- data/target/dependency/apache-mime4j-core-0.7.2.jar +0 -0
- data/target/dependency/apache-mime4j-dom-0.7.2.jar +0 -0
- data/target/dependency/asm-3.1.jar +0 -0
- data/target/dependency/aspectjrt-1.6.11.jar +0 -0
- data/target/dependency/bcmail-jdk15-1.45.jar +0 -0
- data/target/dependency/bcprov-jdk15-1.45.jar +0 -0
- data/target/dependency/boilerpipe-1.1.0.jar +0 -0
- data/target/dependency/commons-codec-1.5.jar +0 -0
- data/target/dependency/commons-compress-1.4.1.jar +0 -0
- data/target/dependency/commons-logging-1.1.1.jar +0 -0
- data/target/dependency/dom4j-1.6.1.jar +0 -0
- data/target/dependency/fontbox-1.7.1.jar +0 -0
- data/target/dependency/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
- data/target/dependency/isoparser-1.0-RC-1.jar +0 -0
- data/target/dependency/jdom-1.0.jar +0 -0
- data/target/dependency/jempbox-1.7.1.jar +0 -0
- data/target/dependency/juniversalchardet-1.0.3.jar +0 -0
- data/target/dependency/metadata-extractor-2.6.2.jar +0 -0
- data/target/dependency/netcdf-4.2-min.jar +0 -0
- data/target/dependency/pdfbox-1.7.1.jar +0 -0
- data/target/dependency/poi-3.8.jar +0 -0
- data/target/dependency/poi-ooxml-3.8.jar +0 -0
- data/target/dependency/poi-ooxml-schemas-3.8.jar +0 -0
- data/target/dependency/poi-scratchpad-3.8.jar +0 -0
- data/target/dependency/rome-0.9.jar +0 -0
- data/target/dependency/slf4j-api-1.5.6.jar +0 -0
- data/target/dependency/tagsoup-1.2.1.jar +0 -0
- data/target/dependency/tika-core-1.3.jar +0 -0
- data/target/dependency/tika-parsers-1.3.jar +0 -0
- data/target/dependency/vorbis-java-core-0.1-tests.jar +0 -0
- data/target/dependency/vorbis-java-core-0.1.jar +0 -0
- data/target/dependency/vorbis-java-tika-0.1.jar +0 -0
- data/target/dependency/xercesImpl-2.8.1.jar +0 -0
- data/target/dependency/xml-apis-1.3.03.jar +0 -0
- data/target/dependency/xmlbeans-2.3.0.jar +0 -0
- data/target/dependency/xmpcore-5.1.2.jar +0 -0
- data/target/dependency/xz-1.0.jar +0 -0
data/pom.xml
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
<modelVersion>4.0.0</modelVersion>
|
|
4
4
|
|
|
5
5
|
<name>Rika</name>
|
|
6
|
-
|
|
6
|
+
|
|
7
7
|
<groupId>org.rika</groupId>
|
|
8
8
|
<artifactId>Rika</artifactId>
|
|
9
9
|
<version>1.0-SNAPSHOT</version>
|
|
@@ -12,9 +12,9 @@
|
|
|
12
12
|
<dependencies>
|
|
13
13
|
<dependency>
|
|
14
14
|
<groupId>org.apache.tika</groupId>
|
|
15
|
-
<artifactId>tika-
|
|
16
|
-
<version>1.
|
|
15
|
+
<artifactId>tika-app</artifactId>
|
|
16
|
+
<version>1.24</version>
|
|
17
17
|
<scope>test</scope>
|
|
18
18
|
</dependency>
|
|
19
19
|
</dependencies>
|
|
20
|
-
</project>
|
|
20
|
+
</project>
|
data/rika.gemspec
CHANGED
|
@@ -6,16 +6,18 @@ require 'rika/version'
|
|
|
6
6
|
Gem::Specification.new do |gem|
|
|
7
7
|
gem.name = "rika"
|
|
8
8
|
gem.version = Rika::VERSION
|
|
9
|
-
gem.authors = ["Richard Nyström"]
|
|
10
|
-
gem.email = ["ricny046@gmail.com"]
|
|
11
|
-
gem.description = %q{ A JRuby wrapper for Apache Tika to extract text and metadata from various
|
|
12
|
-
gem.summary = %q{ A JRuby wrapper for Apache Tika to extract text and metadata from various
|
|
13
|
-
gem.homepage = "https://github.com/
|
|
9
|
+
gem.authors = ["Richard Nyström", "Keith Bennett"]
|
|
10
|
+
gem.email = ["ricny046@gmail.com", "keithrbennett@gmail.com"]
|
|
11
|
+
gem.description = %q{ A JRuby wrapper for Apache Tika to extract text and metadata from files of various formats. }
|
|
12
|
+
gem.summary = %q{ A JRuby wrapper for Apache Tika to extract text and metadata from files of various formats. }
|
|
13
|
+
gem.homepage = "https://github.com/keithrbennett/rika"
|
|
14
14
|
gem.files = `git ls-files`.split($/)
|
|
15
15
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
|
16
16
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
|
17
17
|
gem.require_paths = ["lib"]
|
|
18
|
-
gem.add_development_dependency "rspec", "
|
|
19
|
-
gem.add_development_dependency "rake", "
|
|
18
|
+
gem.add_development_dependency "rspec", "~> 3.9"
|
|
19
|
+
gem.add_development_dependency "rake", "~> 13.0"
|
|
20
20
|
gem.platform = "java"
|
|
21
|
+
gem.license = "Apache-2.0"
|
|
21
22
|
end
|
|
23
|
+
|
data/rika_helper.rb
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Defines some shortcuts for ad-hoc work with Rika.
|
|
2
|
+
#
|
|
3
|
+
# Can be used with the `irb`/`jirb` or `pry` (https://github.com/pry/pry) interactive shells:
|
|
4
|
+
# `pry -r rika_helper.rb`
|
|
5
|
+
#
|
|
6
|
+
# Can be used with the `rexe` command line executor (https://github.com/keithrbennett/rexe):
|
|
7
|
+
# rexe -r ./rika_helper.rb # e.g., add: `-oa 'm "x.pdf"'` to output metadata w/AwesomePrint
|
|
8
|
+
#
|
|
9
|
+
# or plain Ruby:
|
|
10
|
+
# ruby -r ./rika_helper -r awesome_print -e 'ap m("x.pdf")'
|
|
11
|
+
|
|
12
|
+
require 'rika'
|
|
13
|
+
|
|
14
|
+
def c(resource)
|
|
15
|
+
Rika.parse_content(resource)
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def m(resource)
|
|
19
|
+
Rika.parse_metadata(resource)
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
def cm(resource)
|
|
23
|
+
Rika.parse_content_and_metadata(resource)
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def cmh(resource)
|
|
27
|
+
Rika.parse_content_and_metadata_as_hash(resource)
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
# Metadata of +resource+ as a compact JSON string. Loads 'json' lazily so the helper works even if nothing else required it.
def mj(resource); require 'json'; m(resource).to_json ; end
|
|
31
|
+
# Metadata of +resource+ as pretty-printed JSON. Loads 'json' lazily because the JSON constant is not otherwise guaranteed to be defined.
def mJ(resource); require 'json'; JSON.pretty_generate(m(resource)) ; end
|
|
32
|
+
# Metadata of +resource+ as a YAML string. Loads 'yaml' lazily because #to_yaml is only available once it is required.
def my(resource); require 'yaml'; m(resource).to_yaml ; end
|
|
33
|
+
# Metadata of +resource+ formatted by AwesomePrint (String returned by #ai).
# Named `ma` (parallel to `cma`); it was previously a second `my`, which silently replaced the YAML helper above.
def ma(resource); require 'awesome_print'; m(resource).ai ;end
|
|
34
|
+
|
|
35
|
+
# Content and metadata of +resource+ as a compact JSON string.
# Serializes the { content:, metadata: } hash from cmh; it previously serialized only the content string.
def cmj(resource); require 'json'; cmh(resource).to_json; end
|
|
36
|
+
# Content and metadata of +resource+ as pretty-printed JSON.
# Uses the { content:, metadata: } hash from cmh; it previously passed only the content string to JSON.pretty_generate.
def cmJ(resource); require 'json'; JSON.pretty_generate(cmh(resource)); end
|
|
37
|
+
# Content and metadata of +resource+ as a YAML string.
# Uses the { content:, metadata: } hash from cmh; it previously dumped only the content string.
def cmy(resource); require 'yaml'; cmh(resource).to_yaml ; end
|
|
38
|
+
# Loads AwesomePrint, then returns { content:, metadata: } built from cm(resource).
def cma(resource); require 'awesome_print'; content, metadata = cm(resource); { content: content, metadata: metadata }; end
|
data/spec/fixtures/de.txt
CHANGED
|
@@ -1 +1,21 @@
|
|
|
1
|
-
|
|
1
|
+
Eines verschneiten Abends im Wald verhaltend (German)
|
|
2
|
+
|
|
3
|
+
Wohl weiß ich, wer den Wald besitzt,
|
|
4
|
+
Sein Haus jedoch im Dorf dort ist;
|
|
5
|
+
Er merkt nicht, wie ich hier verharr'
|
|
6
|
+
Zu schaun, wie Wald den Schnee begrüßt.
|
|
7
|
+
|
|
8
|
+
Mein Pferdchen denkt, ich wär ein Narr,
|
|
9
|
+
Zu halten, wo nicht Haus noch Farm,
|
|
10
|
+
Im Wald, nah beim gefrornen Teich,
|
|
11
|
+
Am finstersten Abend im Jahr.
|
|
12
|
+
|
|
13
|
+
Des Zaumzeugs Glöckchen klingen leis:
|
|
14
|
+
Es fragt: was ist? geht's weiter gleich?
|
|
15
|
+
Ansonsten hört man nur ganz sacht
|
|
16
|
+
Den Schneefall und den Wind, der streicht.
|
|
17
|
+
|
|
18
|
+
Der Wald ist nett, tief wie die Nacht,
|
|
19
|
+
Doch halte ich, was ich versprach,
|
|
20
|
+
Und Meilen sind's noch bis zum Schlaf,
|
|
21
|
+
Und Meilen sind's noch bis zum Schlaf.
|
data/spec/fixtures/document.doc
CHANGED
|
Binary file
|
data/spec/fixtures/document.docx
CHANGED
|
Binary file
|
data/spec/fixtures/document.pdf
CHANGED
|
Binary file
|
data/spec/fixtures/en.txt
CHANGED
|
@@ -1 +1,23 @@
|
|
|
1
|
-
|
|
1
|
+
Stopping by Woods on a Snowy Evening
|
|
2
|
+
|
|
3
|
+
By Robert Frost
|
|
4
|
+
|
|
5
|
+
Whose woods these are I think I know.
|
|
6
|
+
His house is in the village though;
|
|
7
|
+
He will not see me stopping here
|
|
8
|
+
To watch his woods fill up with snow.
|
|
9
|
+
|
|
10
|
+
My little horse must think it queer
|
|
11
|
+
To stop without a farmhouse near
|
|
12
|
+
Between the woods and frozen lake
|
|
13
|
+
The darkest evening of the year.
|
|
14
|
+
|
|
15
|
+
He gives his harness bells a shake
|
|
16
|
+
To ask if there is some mistake.
|
|
17
|
+
The only other sound’s the sweep
|
|
18
|
+
Of easy wind and downy flake.
|
|
19
|
+
|
|
20
|
+
The woods are lovely, dark and deep,
|
|
21
|
+
But I have promises to keep,
|
|
22
|
+
And miles to go before I sleep,
|
|
23
|
+
And miles to go before I sleep.
|
data/spec/fixtures/es.txt
CHANGED
|
@@ -1 +1,21 @@
|
|
|
1
|
-
|
|
1
|
+
Un alto en el bosque mientras nieva
|
|
2
|
+
|
|
3
|
+
De quién es este bosque, saber creo
|
|
4
|
+
-en el poblado su morada veo-
|
|
5
|
+
no habrá de sorprenderme contemplando
|
|
6
|
+
cubrir su bosque el invernal blanqueo.
|
|
7
|
+
|
|
8
|
+
Mi caballito se dirá extrañado
|
|
9
|
+
que, sin granja cercana, hemos parado
|
|
10
|
+
de este año en la tarde más oscura,
|
|
11
|
+
entre el bosque y el lago congelado.
|
|
12
|
+
|
|
13
|
+
Sacudiéndose, agita su cencerro
|
|
14
|
+
preguntando quizá: -¿será algún yerro?
|
|
15
|
+
Sólo el cierzo y los copos rumorean
|
|
16
|
+
blandamente del bosque en el encierro.
|
|
17
|
+
|
|
18
|
+
Yo, el bosque hondo y fusco veo risueño...
|
|
19
|
+
Mas, en cumplir promesas tengo empeño,
|
|
20
|
+
y millas debo andar antes del sueño,
|
|
21
|
+
un largo andar para llegar al sueño.
|
data/spec/fixtures/fr.txt
CHANGED
|
@@ -1 +1,23 @@
|
|
|
1
|
-
En
|
|
1
|
+
En s'arrêtant par les bois un soir de neige
|
|
2
|
+
|
|
3
|
+
Auteur: Robert Frost
|
|
4
|
+
|
|
5
|
+
À qui sont ces bois, je crois que je sais.
|
|
6
|
+
Sa maison est au village pourtant ;
|
|
7
|
+
Il ne me verra pas m'arrêter ici
|
|
8
|
+
À regarder ses bois recouverts de neige.
|
|
9
|
+
|
|
10
|
+
Mon petit cheval doit trouver cela étrange
|
|
11
|
+
De s'arrêter loin de toute ferme aux alentours
|
|
12
|
+
Entre ces bois et ce lac gelé
|
|
13
|
+
Au soir le plus sombre de l'année.
|
|
14
|
+
|
|
15
|
+
Il fait tinter les clochettes de son harnais en tirant dessus
|
|
16
|
+
Comme pour demander s'il n'y a pas là quelque erreur.
|
|
17
|
+
Les seuls autres bruits le souffle
|
|
18
|
+
D'une brise légère et le son duveteux des flocons.
|
|
19
|
+
|
|
20
|
+
Les bois sont beaux, sombres et profonds,
|
|
21
|
+
Mais j'ai des promesses à tenir
|
|
22
|
+
Et un long chemin à parcourir avant de dormir,
|
|
23
|
+
Et un long chemin à parcourir avant de dormir.
|
data/spec/fixtures/ru.txt
CHANGED
|
@@ -1 +1,21 @@
|
|
|
1
|
-
|
|
1
|
+
Остановившись у леса снежным вечером
|
|
2
|
+
|
|
3
|
+
Чей лес, мне кажется, я знаю:
|
|
4
|
+
в селе живет его хозяин.
|
|
5
|
+
Он не увидит, как на снежный
|
|
6
|
+
я лес его стою взираю.
|
|
7
|
+
|
|
8
|
+
В недоуменье конь, конечно,
|
|
9
|
+
зачем в ночи за год темнейшей
|
|
10
|
+
мы стали там, где нет жилья,
|
|
11
|
+
у леса с озером замерзшим.
|
|
12
|
+
|
|
13
|
+
Он, бубенцом слегка звеня,
|
|
14
|
+
как будто бы корит меня,
|
|
15
|
+
да веет слабый ветерок,
|
|
16
|
+
пушистым снегом шелестя.
|
|
17
|
+
|
|
18
|
+
Лес сладок, темен и глубок,
|
|
19
|
+
но в путь пора мне — долг есть долг.
|
|
20
|
+
И ехать долго — сон далек,
|
|
21
|
+
и ехать долго — сон далек.
|
data/spec/fixtures/text_file.txt
CHANGED
|
@@ -1 +1,23 @@
|
|
|
1
|
-
|
|
1
|
+
Stopping by Woods on a Snowy Evening
|
|
2
|
+
|
|
3
|
+
By Robert Frost
|
|
4
|
+
|
|
5
|
+
Whose woods these are I think I know.
|
|
6
|
+
His house is in the village though;
|
|
7
|
+
He will not see me stopping here
|
|
8
|
+
To watch his woods fill up with snow.
|
|
9
|
+
|
|
10
|
+
My little horse must think it queer
|
|
11
|
+
To stop without a farmhouse near
|
|
12
|
+
Between the woods and frozen lake
|
|
13
|
+
The darkest evening of the year.
|
|
14
|
+
|
|
15
|
+
He gives his harness bells a shake
|
|
16
|
+
To ask if there is some mistake.
|
|
17
|
+
The only other sound’s the sweep
|
|
18
|
+
Of easy wind and downy flake.
|
|
19
|
+
|
|
20
|
+
The woods are lovely, dark and deep,
|
|
21
|
+
But I have promises to keep,
|
|
22
|
+
And miles to go before I sleep,
|
|
23
|
+
And miles to go before I sleep.
|
|
@@ -1 +1,23 @@
|
|
|
1
|
-
|
|
1
|
+
Stopping by Woods on a Snowy Evening
|
|
2
|
+
|
|
3
|
+
By Robert Frost
|
|
4
|
+
|
|
5
|
+
Whose woods these are I think I know.
|
|
6
|
+
His house is in the village though;
|
|
7
|
+
He will not see me stopping here
|
|
8
|
+
To watch his woods fill up with snow.
|
|
9
|
+
|
|
10
|
+
My little horse must think it queer
|
|
11
|
+
To stop without a farmhouse near
|
|
12
|
+
Between the woods and frozen lake
|
|
13
|
+
The darkest evening of the year.
|
|
14
|
+
|
|
15
|
+
He gives his harness bells a shake
|
|
16
|
+
To ask if there is some mistake.
|
|
17
|
+
The only other sound’s the sweep
|
|
18
|
+
Of easy wind and downy flake.
|
|
19
|
+
|
|
20
|
+
The woods are lovely, dark and deep,
|
|
21
|
+
But I have promises to keep,
|
|
22
|
+
And miles to go before I sleep,
|
|
23
|
+
And miles to go before I sleep.
|
data/spec/rika_spec.rb
CHANGED
|
@@ -4,165 +4,193 @@ require 'spec_helper'
|
|
|
4
4
|
require 'webrick'
|
|
5
5
|
|
|
6
6
|
include WEBrick
|
|
7
|
-
|
|
8
|
-
describe Rika::Parser do
|
|
9
|
-
before(:all) do
|
|
10
|
-
@txt_parser = Rika::Parser.new(file_path("text_file.txt"))
|
|
11
|
-
@docx_parser = Rika::Parser.new(file_path("document.docx"))
|
|
12
|
-
@pdf_parser = Rika::Parser.new(file_path("document.pdf"))
|
|
13
|
-
@image_parser = Rika::Parser.new(file_path("image.jpg"))
|
|
14
|
-
@unknown_parser = Rika::Parser.new(file_path("unknown.bin"))
|
|
15
|
-
@dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
|
|
16
|
-
port = 50505
|
|
17
|
-
@url = "http://#{Socket.gethostname}:#{port}"
|
|
18
|
-
@quote = "First they ignore you, then they ridicule you, then they fight you, then you win."
|
|
19
|
-
@t1 = Thread.new do
|
|
20
|
-
@server = HTTPServer.new(:Port => port, :DocumentRoot => @dir,
|
|
21
|
-
:AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
|
|
22
|
-
@server.start
|
|
23
|
-
end
|
|
24
|
-
@sample_pdf_filespec = file_path("document.pdf")
|
|
25
|
-
end
|
|
26
7
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
8
|
+
describe Rika::Parser do
|
|
9
|
+
|
|
10
|
+
let (:txt_parser) { Rika::Parser.new(file_path('text_file.txt')) }
|
|
11
|
+
let (:docx_parser) { Rika::Parser.new(file_path('document.docx')) }
|
|
12
|
+
let (:doc_parser) { Rika::Parser.new(file_path('document.doc')) }
|
|
13
|
+
let (:pdf_parser) { Rika::Parser.new(file_path('document.pdf')) }
|
|
14
|
+
let (:image_parser) { Rika::Parser.new(file_path('image.jpg')) }
|
|
15
|
+
let (:unknown_parser) { Rika::Parser.new(file_path('unknown.bin')) }
|
|
16
|
+
let (:dir) { File.expand_path(File.join(File.dirname(__FILE__), 'fixtures')) }
|
|
17
|
+
let (:quote_first_line) { 'Stopping by Woods on a Snowy Evening' }
|
|
18
|
+
|
|
19
|
+
port = 50515
|
|
20
|
+
let (:url) { "http://#{Socket.gethostname}:#{port}" }
|
|
21
|
+
|
|
22
|
+
let (:sample_pdf_filespec) { file_path('document.pdf') }
|
|
23
|
+
|
|
24
|
+
let(:first_line) { ->(string) { string.split("\n").first.strip } }
|
|
25
|
+
|
|
26
|
+
let(:server_runner) do
|
|
27
|
+
# returns a lambda that, when passed an action, will wrap it in an HTTP server
|
|
28
|
+
->(action) do
|
|
29
|
+
server = nil
|
|
30
|
+
server_thread = Thread.new do
|
|
31
|
+
server = HTTPServer.new(
|
|
32
|
+
Port: port,
|
|
33
|
+
DocumentRoot: dir,
|
|
34
|
+
AccessLog: [],
|
|
35
|
+
Logger: WEBrick::Log::new('/dev/null')
|
|
36
|
+
)
|
|
37
|
+
server.start
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
# Wait for server to become ready on its new thread
|
|
41
|
+
sleep 0.01 while server.nil?
|
|
42
|
+
begin
|
|
43
|
+
action.call
|
|
44
|
+
ensure
|
|
45
|
+
server.shutdown
|
|
46
|
+
server_thread.exit
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
end
|
|
30
50
|
|
|
31
|
-
it "should raise error if file does not exists" do
|
|
32
|
-
lambda { Rika::Parser.new(file_path("nonsense.txt")) }.should raise_error(IOError)
|
|
33
|
-
end
|
|
34
51
|
|
|
35
|
-
it
|
|
36
|
-
|
|
52
|
+
it 'should raise error if file does not exist' do
|
|
53
|
+
expect(-> { Rika::Parser.new(file_path('nonexistent_file.txt')) }).to raise_error(IOError)
|
|
37
54
|
end
|
|
38
55
|
|
|
39
|
-
it
|
|
40
|
-
|
|
41
|
-
|
|
56
|
+
it 'should raise error if URL does not exist' do
|
|
57
|
+
unavailable_server = 'http://k6075sd0dfkr8nvfw0zvwfwckucf2aba.com'
|
|
58
|
+
unavailable_file_on_web = File.join(unavailable_server, 'x.pdf')
|
|
59
|
+
expect(-> { Rika::Parser.new(unavailable_file_on_web) }).to raise_error(SocketError)
|
|
42
60
|
end
|
|
43
61
|
|
|
44
|
-
it
|
|
45
|
-
|
|
62
|
+
it 'should detect file type without a file extension' do
|
|
63
|
+
parser = Rika::Parser.new(file_path('text_file_without_extension'))
|
|
64
|
+
expect(parser.metadata['Content-Type']).to eq('text/plain; charset=UTF-8')
|
|
46
65
|
end
|
|
47
66
|
|
|
48
67
|
describe '#content' do
|
|
49
|
-
it
|
|
50
|
-
|
|
68
|
+
it 'should return the content in a text file' do
|
|
69
|
+
expect(first_line.(txt_parser.content)).to eq(quote_first_line)
|
|
51
70
|
end
|
|
52
71
|
|
|
53
|
-
it
|
|
54
|
-
|
|
72
|
+
it 'should return the content in a docx file' do
|
|
73
|
+
expect(first_line.(docx_parser.content)).to eq(quote_first_line)
|
|
55
74
|
end
|
|
56
75
|
|
|
57
|
-
it
|
|
58
|
-
|
|
76
|
+
it 'should return the content in a pdf file' do
|
|
77
|
+
expect(first_line.(pdf_parser.content)).to eq(quote_first_line)
|
|
59
78
|
end
|
|
60
79
|
|
|
61
|
-
it
|
|
62
|
-
|
|
80
|
+
it 'should return no content for an image' do
|
|
81
|
+
expect(image_parser.metadata.keys).to_not be_empty
|
|
63
82
|
end
|
|
64
83
|
|
|
65
|
-
it
|
|
66
|
-
|
|
67
|
-
parser.content.should == "First"
|
|
84
|
+
it 'should only return max content length' do
|
|
85
|
+
expect(Rika::Parser.new(file_path('text_file.txt'), 9).content).to eq('Stopping')
|
|
68
86
|
end
|
|
69
87
|
|
|
70
|
-
it
|
|
71
|
-
|
|
72
|
-
|
|
88
|
+
it 'should only return max content length for file over http', focus: true do
|
|
89
|
+
server_runner.call( -> do
|
|
90
|
+
expect(Rika::Parser.new(File.join(url, 'document.pdf'), 9).content).to eq('Stopping')
|
|
91
|
+
end)
|
|
73
92
|
end
|
|
74
93
|
|
|
75
|
-
it
|
|
76
|
-
|
|
77
|
-
|
|
94
|
+
it 'should return the content from a file over http' do
|
|
95
|
+
server_runner.call( -> do
|
|
96
|
+
content = Rika::Parser.new(File.join(url, 'document.pdf')).content
|
|
97
|
+
expect(first_line.(content)).to eq(quote_first_line)
|
|
98
|
+
end)
|
|
78
99
|
end
|
|
79
100
|
|
|
80
|
-
it
|
|
81
|
-
|
|
82
|
-
parser.content.should == @quote
|
|
83
|
-
end
|
|
84
|
-
|
|
85
|
-
it "should return empty string for unknown file" do
|
|
86
|
-
@unknown_parser.content.should be_empty
|
|
101
|
+
it 'should return empty string for unknown file' do
|
|
102
|
+
expect(unknown_parser.content).to be_empty
|
|
87
103
|
end
|
|
88
104
|
end
|
|
89
105
|
|
|
90
|
-
# We just test a few of the metadata fields for some common file formats
|
|
91
|
-
# to make sure the integration with Apache Tika works. Apache Tika already
|
|
106
|
+
# We just test a few of the metadata fields for some common file formats
|
|
107
|
+
# to make sure the integration with Apache Tika works. Apache Tika already
|
|
92
108
|
# has tests for all file formats it supports, so we won't retest that
|
|
93
109
|
describe '#metadata' do
|
|
94
|
-
it
|
|
95
|
-
|
|
110
|
+
it 'should return nil if metadata field does not exist' do
|
|
111
|
+
expect(txt_parser.metadata['nonsense']).to be_nil
|
|
96
112
|
end
|
|
97
113
|
|
|
98
|
-
it
|
|
99
|
-
|
|
114
|
+
it 'should return metadata from a docx file' do
|
|
115
|
+
expect(docx_parser.metadata['Page-Count']).to eq('1')
|
|
100
116
|
end
|
|
101
117
|
|
|
102
|
-
it
|
|
103
|
-
|
|
118
|
+
it 'should return metadata from a pdf file' do
|
|
119
|
+
expect(pdf_parser.metadata['Author']).to eq('Robert Frost')
|
|
104
120
|
end
|
|
105
121
|
|
|
106
|
-
it
|
|
107
|
-
|
|
108
|
-
|
|
122
|
+
it 'should return metadata from a file over http', focus: true do
|
|
123
|
+
server_runner.call( -> do
|
|
124
|
+
parser = Rika::Parser.new(File.join(url, 'document.pdf'))
|
|
125
|
+
expect(parser.metadata['Author']).to eq('Robert Frost')
|
|
126
|
+
end)
|
|
109
127
|
end
|
|
110
128
|
|
|
111
|
-
it
|
|
112
|
-
|
|
113
|
-
|
|
129
|
+
it 'should return metadata from an image' do
|
|
130
|
+
expect(image_parser.metadata['Image Height']).to eq('72 pixels')
|
|
131
|
+
expect(image_parser.metadata['Image Width']).to eq('72 pixels')
|
|
114
132
|
end
|
|
115
133
|
end
|
|
116
134
|
|
|
117
135
|
describe '#available_metadata' do
|
|
118
|
-
it
|
|
119
|
-
|
|
136
|
+
it 'should return available metadata fields' do
|
|
137
|
+
expect(txt_parser.available_metadata).to_not be_empty
|
|
120
138
|
end
|
|
121
139
|
|
|
122
|
-
it
|
|
123
|
-
|
|
140
|
+
it 'should be an array' do
|
|
141
|
+
expect(txt_parser.available_metadata).to be_an(Array)
|
|
124
142
|
end
|
|
125
143
|
end
|
|
126
144
|
|
|
127
145
|
describe '#metadata_exists?' do
|
|
128
|
-
it
|
|
129
|
-
|
|
146
|
+
it 'should return false if metadata does not exist' do
|
|
147
|
+
expect(txt_parser.metadata_exists?('title')).to be false
|
|
130
148
|
end
|
|
131
149
|
|
|
132
|
-
it
|
|
133
|
-
|
|
150
|
+
it 'should return true if metadata exist' do
|
|
151
|
+
expect(docx_parser.metadata_exists?('title')).to be true
|
|
134
152
|
end
|
|
135
153
|
end
|
|
136
154
|
|
|
137
155
|
describe '#media_type' do
|
|
138
|
-
it
|
|
139
|
-
|
|
156
|
+
it 'should return application/pdf for a pdf file' do
|
|
157
|
+
expect(pdf_parser.media_type).to eq('application/pdf')
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
it 'should return text/plain for a txt file' do
|
|
161
|
+
expect(txt_parser.media_type).to eq('text/plain')
|
|
140
162
|
end
|
|
141
163
|
|
|
142
|
-
it
|
|
143
|
-
|
|
164
|
+
it 'should return application/pdf for a pdf over http' do
|
|
165
|
+
server_runner.call( -> do
|
|
166
|
+
parser = Rika::Parser.new(File.join(url, 'document.pdf'))
|
|
167
|
+
expect(parser.media_type).to eq('application/pdf')
|
|
168
|
+
end)
|
|
144
169
|
end
|
|
145
170
|
|
|
146
|
-
it
|
|
147
|
-
|
|
148
|
-
parser.media_type.should == "application/pdf"
|
|
171
|
+
it 'should return application/octet-stream for unknown file' do
|
|
172
|
+
expect(unknown_parser.media_type).to eq('application/octet-stream')
|
|
149
173
|
end
|
|
150
174
|
|
|
151
|
-
it
|
|
152
|
-
|
|
175
|
+
it 'should return msword for a doc file' do
|
|
176
|
+
expect(doc_parser.media_type).to eq('application/msword')
|
|
177
|
+
end
|
|
178
|
+
|
|
179
|
+
it 'should return wordprocessingml for a docx file' do
|
|
180
|
+
expect(docx_parser.media_type).to eq('application/vnd.openxmlformats-officedocument.wordprocessingml.document')
|
|
153
181
|
end
|
|
154
182
|
end
|
|
155
183
|
|
|
156
184
|
describe '#language' do
|
|
157
|
-
it
|
|
158
|
-
|
|
159
|
-
["en", "de", "fr", "ru", "es"].each do |lang|
|
|
185
|
+
it 'should return the language of the content' do
|
|
186
|
+
%w(en de fr ru es).each do |lang|
|
|
160
187
|
txt = Rika::Parser.new(file_path("#{lang}.txt"))
|
|
161
|
-
txt.language.
|
|
188
|
+
expect(txt.language).to eq(lang)
|
|
162
189
|
end
|
|
163
190
|
end
|
|
164
191
|
end
|
|
165
192
|
|
|
193
|
+
# See note in rika.rb #language_is_reasonably_certain? regarding this method's future.
|
|
166
194
|
describe '#language_is_reasonably_certain?' do
|
|
167
195
|
it "should return false if lang can't be determined" do
|
|
168
196
|
lang = Rika::Parser.new(file_path("lang_cant_be_determined.txt"))
|
|
@@ -175,19 +203,43 @@ describe Rika::Parser do
|
|
|
175
203
|
end
|
|
176
204
|
end
|
|
177
205
|
|
|
178
|
-
it
|
|
179
|
-
content = Rika.parse_content(
|
|
180
|
-
(content.
|
|
206
|
+
it 'should return valid content using Rika.parse_content' do
|
|
207
|
+
content = Rika.parse_content(sample_pdf_filespec)
|
|
208
|
+
expect(content).to be_a(String)
|
|
209
|
+
expect(content).to_not be_empty
|
|
181
210
|
end
|
|
182
211
|
|
|
183
|
-
it
|
|
184
|
-
metadata = Rika.parse_metadata(
|
|
185
|
-
(metadata.
|
|
212
|
+
it 'should return valid metadata using Rika.parse_metadata' do
|
|
213
|
+
metadata = Rika.parse_metadata(sample_pdf_filespec)
|
|
214
|
+
expect(metadata).to be_a(Hash)
|
|
215
|
+
expect(metadata).to_not be_empty
|
|
186
216
|
end
|
|
187
217
|
|
|
188
|
-
it
|
|
189
|
-
content, metadata = Rika.parse_content_and_metadata(
|
|
190
|
-
(content.
|
|
191
|
-
|
|
218
|
+
it 'should return valid content and metadata using Rika.parse_content_and_metadata' do
|
|
219
|
+
content, metadata = Rika.parse_content_and_metadata(sample_pdf_filespec)
|
|
220
|
+
expect(content).to be_a(String)
|
|
221
|
+
expect(content).to_not be_empty
|
|
222
|
+
expect(metadata).to be_a(Hash)
|
|
223
|
+
expect(metadata).to_not be_empty
|
|
224
|
+
end
|
|
225
|
+
|
|
226
|
+
specify 'both means of getting both content and metadata should return the same values' do
|
|
227
|
+
content_1, metadata_1 = Rika.parse_content_and_metadata(sample_pdf_filespec)
|
|
228
|
+
|
|
229
|
+
h = Rika.parse_content_and_metadata_as_hash(sample_pdf_filespec)
|
|
230
|
+
content_2 = h[:content]
|
|
231
|
+
metadata_2 = h[:metadata]
|
|
232
|
+
|
|
233
|
+
expect(content_1).to eq(content_2)
|
|
234
|
+
expect(metadata_1).to eq(metadata_2)
|
|
235
|
+
end
|
|
236
|
+
|
|
237
|
+
specify 'getting content and metadata individually and together should return the same values' do
|
|
238
|
+
content_1, metadata_1 = Rika.parse_content_and_metadata(sample_pdf_filespec, -1)
|
|
239
|
+
content_2 = Rika.parse_content(sample_pdf_filespec)
|
|
240
|
+
metadata_2 = Rika.parse_metadata(sample_pdf_filespec, -1)
|
|
241
|
+
|
|
242
|
+
expect(content_1).to eq(content_2)
|
|
243
|
+
expect(metadata_1).to eq(metadata_2)
|
|
192
244
|
end
|
|
193
245
|
end
|