rika 1.1.1-java → 1.11.1-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (64) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +1 -0
  3. data/.travis.yml +3 -3
  4. data/README.md +82 -40
  5. data/RELEASE_NOTES.md +17 -0
  6. data/Rakefile +1 -1
  7. data/java-lib/tika-app-1.24.1.jar +0 -0
  8. data/lib/rika.rb +18 -93
  9. data/lib/rika/parser.rb +90 -0
  10. data/lib/rika/version.rb +1 -1
  11. data/pom.xml +4 -4
  12. data/rika.gemspec +9 -7
  13. data/rika_helper.rb +38 -0
  14. data/spec/fixtures/de.txt +21 -1
  15. data/spec/fixtures/document.doc +0 -0
  16. data/spec/fixtures/document.docx +0 -0
  17. data/spec/fixtures/document.pdf +0 -0
  18. data/spec/fixtures/en.txt +23 -1
  19. data/spec/fixtures/es.txt +21 -1
  20. data/spec/fixtures/fr.txt +23 -1
  21. data/spec/fixtures/ru.txt +21 -1
  22. data/spec/fixtures/text_file.txt +23 -1
  23. data/spec/fixtures/text_file_without_extension +23 -1
  24. data/spec/rika_spec.rb +153 -101
  25. data/spec/spec_helper.rb +4 -3
  26. metadata +36 -76
  27. data/spec/fixtures/over_100k_file.txt +0 -1241
  28. data/target/dependency/apache-mime4j-core-0.7.2.jar +0 -0
  29. data/target/dependency/apache-mime4j-dom-0.7.2.jar +0 -0
  30. data/target/dependency/asm-3.1.jar +0 -0
  31. data/target/dependency/aspectjrt-1.6.11.jar +0 -0
  32. data/target/dependency/bcmail-jdk15-1.45.jar +0 -0
  33. data/target/dependency/bcprov-jdk15-1.45.jar +0 -0
  34. data/target/dependency/boilerpipe-1.1.0.jar +0 -0
  35. data/target/dependency/commons-codec-1.5.jar +0 -0
  36. data/target/dependency/commons-compress-1.4.1.jar +0 -0
  37. data/target/dependency/commons-logging-1.1.1.jar +0 -0
  38. data/target/dependency/dom4j-1.6.1.jar +0 -0
  39. data/target/dependency/fontbox-1.7.1.jar +0 -0
  40. data/target/dependency/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
  41. data/target/dependency/isoparser-1.0-RC-1.jar +0 -0
  42. data/target/dependency/jdom-1.0.jar +0 -0
  43. data/target/dependency/jempbox-1.7.1.jar +0 -0
  44. data/target/dependency/juniversalchardet-1.0.3.jar +0 -0
  45. data/target/dependency/metadata-extractor-2.6.2.jar +0 -0
  46. data/target/dependency/netcdf-4.2-min.jar +0 -0
  47. data/target/dependency/pdfbox-1.7.1.jar +0 -0
  48. data/target/dependency/poi-3.8.jar +0 -0
  49. data/target/dependency/poi-ooxml-3.8.jar +0 -0
  50. data/target/dependency/poi-ooxml-schemas-3.8.jar +0 -0
  51. data/target/dependency/poi-scratchpad-3.8.jar +0 -0
  52. data/target/dependency/rome-0.9.jar +0 -0
  53. data/target/dependency/slf4j-api-1.5.6.jar +0 -0
  54. data/target/dependency/tagsoup-1.2.1.jar +0 -0
  55. data/target/dependency/tika-core-1.3.jar +0 -0
  56. data/target/dependency/tika-parsers-1.3.jar +0 -0
  57. data/target/dependency/vorbis-java-core-0.1-tests.jar +0 -0
  58. data/target/dependency/vorbis-java-core-0.1.jar +0 -0
  59. data/target/dependency/vorbis-java-tika-0.1.jar +0 -0
  60. data/target/dependency/xercesImpl-2.8.1.jar +0 -0
  61. data/target/dependency/xml-apis-1.3.03.jar +0 -0
  62. data/target/dependency/xmlbeans-2.3.0.jar +0 -0
  63. data/target/dependency/xmpcore-5.1.2.jar +0 -0
  64. data/target/dependency/xz-1.0.jar +0 -0
data/pom.xml CHANGED
@@ -3,7 +3,7 @@
3
3
  <modelVersion>4.0.0</modelVersion>
4
4
 
5
5
  <name>Rika</name>
6
-
6
+
7
7
  <groupId>org.rika</groupId>
8
8
  <artifactId>Rika</artifactId>
9
9
  <version>1.0-SNAPSHOT</version>
@@ -12,9 +12,9 @@
12
12
  <dependencies>
13
13
  <dependency>
14
14
  <groupId>org.apache.tika</groupId>
15
- <artifactId>tika-parsers</artifactId>
16
- <version>1.3</version>
15
+ <artifactId>tika-app</artifactId>
16
+ <version>1.24</version>
17
17
  <scope>test</scope>
18
18
  </dependency>
19
19
  </dependencies>
20
- </project>
20
+ </project>
@@ -6,16 +6,18 @@ require 'rika/version'
6
6
  Gem::Specification.new do |gem|
7
7
  gem.name = "rika"
8
8
  gem.version = Rika::VERSION
9
- gem.authors = ["Richard Nyström"]
10
- gem.email = ["ricny046@gmail.com"]
11
- gem.description = %q{ A JRuby wrapper for Apache Tika to extract text and metadata from various file formats. }
12
- gem.summary = %q{ A JRuby wrapper for Apache Tika to extract text and metadata from various file formats. }
13
- gem.homepage = "https://github.com/ricn/rika"
9
+ gem.authors = ["Richard Nyström", "Keith Bennett"]
10
+ gem.email = ["ricny046@gmail.com", "keithrbennett@gmail.com"]
11
+ gem.description = %q{ A JRuby wrapper for Apache Tika to extract text and metadata from files of various formats. }
12
+ gem.summary = %q{ A JRuby wrapper for Apache Tika to extract text and metadata from files of various formats. }
13
+ gem.homepage = "https://github.com/keithrbennett/rika"
14
14
  gem.files = `git ls-files`.split($/)
15
15
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
16
16
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
17
17
  gem.require_paths = ["lib"]
18
- gem.add_development_dependency "rspec", "2.12.0"
19
- gem.add_development_dependency "rake", "10.0.3"
18
+ gem.add_development_dependency "rspec", "~> 3.9"
19
+ gem.add_development_dependency "rake", "~> 13.0"
20
20
  gem.platform = "java"
21
+ gem.license = "Apache-2.0"
21
22
  end
23
+
@@ -0,0 +1,38 @@
1
+ # Defines some shortcuts for ad-hoc work with Rika.
2
+ #
3
+ # Can be used with the `irb`/`jirb` or `pry` (https://github.com/pry/pry) interactive shells:
4
+ # `pry -r rika_helper.rb`
5
+ #
6
+ # Can be used with the `rexe` command line executor (https://github.com/keithrbennett/rexe):
7
+ # rexe -r ./rika_helper.rb # e.g., add: `-oa 'm "x.pdf"'` to output metadata w/AwesomePrint
8
+ #
9
+ # or plain Ruby:
10
+ # ruby -r ./rika_helper -r awesome_print -e 'ap m("x.pdf")'
11
+
12
+ require 'rika'
13
+
14
+ def c(resource)
15
+ Rika.parse_content(resource)
16
+ end
17
+
18
+ def m(resource)
19
+ Rika.parse_metadata(resource)
20
+ end
21
+
22
+ def cm(resource)
23
+ Rika.parse_content_and_metadata(resource)
24
+ end
25
+
26
+ def cmh(resource)
27
+ Rika.parse_content_and_metadata_as_hash(resource)
28
+ end
29
+
30
+ def mj(resource); m(resource).to_json ; end  # metadata as a JSON string
31
+ def mJ(resource); JSON.pretty_generate(m(resource)) ; end  # metadata as pretty-printed JSON
32
+ def my(resource); m(resource).to_yaml ; end  # metadata as YAML
33
+ def ma(resource); require 'awesome_print'; m(resource).ai; end  # metadata formatted by AwesomePrint; was a duplicate `my` that overrode the YAML helper
34
+
35
+ def cmj(resource); cmh(resource).to_json; end  # content AND metadata (as a hash) serialised to JSON; previously emitted content only
36
+ def cmJ(resource); JSON.pretty_generate(cmh(resource)); end  # content AND metadata as pretty-printed JSON; previously emitted content only
37
+ def cmy(resource); cmh(resource).to_yaml; end  # content AND metadata as YAML; previously emitted content only
38
+ def cma(resource); require 'awesome_print'; cmh(resource).ai; end  # content AND metadata formatted by AwesomePrint; `.ai` was missing and locals shadowed the c/m helpers
@@ -1 +1,21 @@
1
- Er hörte leise Schritte hinter sich. Das bedeutete nichts Gutes. Wer würde ihm schon folgen, spät in der Nacht und dazu noch in dieser engen Gasse mitten im übel beleumundeten Hafenviertel? Gerade jetzt, wo er das Ding seines Lebens gedreht hatte und mit der Beute verschwinden wollte! Hatte einer seiner zahllosen Kollegen dieselbe Idee gehabt, ihn beobachtet und abgewartet, um ihn nun um die Früchte seiner Arbeit zu erleichtern? Oder gehörten die Schritte hinter ihm zu einem der unzähligen Gesetzeshüter dieser Stadt, und die stählerne Acht um seine Handgelenke würde gleich zuschnappen? Er konnte die Aufforderung stehen zu bleiben schon hören. Gehetzt sah er sich um. Plötzlich erblickte er den schmalen Durchgang. Blitzartig drehte er sich nach rechts und verschwand zwischen den beiden Gebäuden. Beinahe wäre er dabei über den umgestürzten Mülleimer gefallen, der mitten im Weg lag. Er versuchte, sich in der Dunkelheit seinen Weg zu ertasten und erstarrte: Anscheinend gab es keinen anderen Ausweg aus diesem kleinen Hof als den Durchgang, durch den er gekommen war. Die Schritte wurden lauter und lauter, er sah eine dunkle Gestalt um die Ecke biegen. Fieberhaft irrten seine Augen durch die nächtliche Dunkelheit und suchten einen Ausweg. War jetzt wirklich alles vorbei,
1
+ Eines verschneiten Abends im Wald verhaltend (German)
2
+
3
+ Wohl weiß ich, wer den Wald besitzt,
4
+ Sein Haus jedoch im Dorf dort ist;
5
+ Er merkt nicht, wie ich hier verharr'
6
+ Zu schaun, wie Wald den Schnee begrüßt.
7
+
8
+ Mein Pferdchen denkt, ich wär ein Narr,
9
+ Zu halten, wo nicht Haus noch Farm,
10
+ Im Wald, nah beim gefrornen Teich,
11
+ Am finstersten Abend im Jahr.
12
+
13
+ Des Zaumzeugs Glöckchen klingen leis:
14
+ Es fragt: was ist? geht's weiter gleich?
15
+ Ansonsten hört man nur ganz sacht
16
+ Den Schneefall und den Wind, der streicht.
17
+
18
+ Der Wald ist nett, tief wie die Nacht,
19
+ Doch halte ich, was ich versprach,
20
+ Und Meilen sind's noch bis zum Schlaf,
21
+ Und Meilen sind's noch bis zum Schlaf.
Binary file
Binary file
Binary file
@@ -1 +1,23 @@
1
- Far far away, behind the word mountains, far from the countries Vokalia and Consonantia, there live the blind texts. Separated they live in Bookmarksgrove right at the coast of the Semantics, a large language ocean. A small river named Duden flows by their place and supplies it with the necessary regelialia. It is a paradisematic country, in which roasted parts of sentences fly into your mouth. Even the all-powerful Pointing has no control about the blind texts it is an almost unorthographic life One day however a small line of blind text by the name of Lorem Ipsum decided to leave for the far World of Grammar. The Big Oxmox advised her not to do so, because there were thousands of bad Commas, wild Question Marks and devious Semikoli, but the Little Blind Text didn’t listen. She packed her seven versalia, put her initial into the belt and made herself on the way. When she reached the first hills of the Italic Mountains, she had a last view back on the skyline of her hometown Bookmarksgrove, the headline of Alphabet Village and the subline of her own road, the Line Lane. Pityful a rethoric question ran over her cheek, then
1
+ Stopping by Woods on a Snowy Evening
2
+
3
+ By Robert Frost
4
+
5
+ Whose woods these are I think I know.
6
+ His house is in the village though;
7
+ He will not see me stopping here
8
+ To watch his woods fill up with snow.
9
+
10
+ My little horse must think it queer
11
+ To stop without a farmhouse near
12
+ Between the woods and frozen lake
13
+ The darkest evening of the year.
14
+
15
+ He gives his harness bells a shake
16
+ To ask if there is some mistake.
17
+ The only other sound’s the sweep
18
+ Of easy wind and downy flake.
19
+
20
+ The woods are lovely, dark and deep,
21
+ But I have promises to keep,
22
+ And miles to go before I sleep,
23
+ And miles to go before I sleep.
@@ -1 +1,21 @@
1
- Una mañana, tras un sueño intranquilo, Gregorio Samsa se despertó convertido en un monstruoso insecto. Estaba echado de espaldas sobre un duro caparazón y, al alzar la cabeza, vio su vientre convexo y oscuro, surcado por curvadas callosidades, sobre el que casi no se aguantaba la colcha, que estaba a punto de escurrirse hasta el suelo. Numerosas patas, penosamente delgadas en comparación con el grosor normal de sus piernas, se agitaban sin concierto. - ¿Qué me ha ocurrido? No estaba soñando. Su habitación, una habitación normal, aunque muy pequeña, tenía el aspecto habitual. Sobre la mesa había desparramado un muestrario de paños - Samsa era viajante de comercio-, y de la pared colgaba una estampa recientemente recortada de una revista ilustrada y puesta en un marco dorado. La estampa mostraba a una mujer tocada con un gorro de pieles, envuelta en una estola también de pieles, y que, muy erguida, esgrimía un amplio manguito, asimismo de piel, que ocultaba todo su antebrazo. Gregorio miró hacia la ventana; estaba nublado, y sobre el cinc del alféizar repiqueteaban las gotas de lluvia, lo que le hizo sentir una gran melancolía. «Bueno -pensó-; ¿y si siguiese durmiendo un rato y me olvidase de
1
+ Un alto en el bosque mientras nieva
2
+
3
+ De quién es este bosque, saber creo
4
+ -en el poblado su morada veo-
5
+ no habrá de sorprenderme contemplando
6
+ cubrir su bosque el invernal blanqueo.
7
+
8
+ Mi caballito se dirá extrañado
9
+ que, sin granja cercana, hemos parado
10
+ de este año en la tarde más oscura,
11
+ entre el bosque y el lago congelado.
12
+
13
+ Sacudiéndose, agita su cencerro
14
+ preguntando quizá: -¿será algún yerro?
15
+ Sólo el cierzo y los copos rumorean
16
+ blandamente del bosque en el encierro.
17
+
18
+ Yo, el bosque hondo y fusco veo risueño...
19
+ Mas, en cumplir promesas tengo empeño,
20
+ y millas debo andar antes del sueño,
21
+ un largo andar para llegar al sueño.
@@ -1 +1,23 @@
1
- En se réveillant un matin après des rêves agités, Gregor Samsa se retrouva, dans son lit, métamorphosé en un monstrueux insecte. Il était sur le dos, un dos aussi dur qu’une carapace, et, en relevant un peu la tête, il vit, bombé, brun, cloisonné par des arceaux plus rigides, son abdomen sur le haut duquel la couverture, prête à glisser tout à fait, ne tenait plus qu’à peine. Ses nombreuses pattes, lamentablement grêles par comparaison avec la corpulence qu’il avait par ailleurs, grouillaient désespérément sous ses yeux.« Qu’est-ce qui m’est arrivé ? » pensa-t-il. Ce n’était pas un rêve. Sa chambre, une vraie chambre humaine, juste un peu trop petite, était là tranquille entre les quatre murs qu’il connaissait bien. Au-dessus de la table où était déballée une collection d’échantillons de tissus - Samsa était représentant de commerce - on voyait accrochée l’image qu’il avait récemment découpée dans un magazine et mise dans un joli cadre doré. Elle représentait une dame munie d’une toque et d’un boa tous les deux en fourrure et qui, assise bien droite, tendait vers le spectateur un lourd manchon de fourrure où tout son avant-bras avait disparu. Le regard de Gregor se tourna ensuite vers
1
+ En s'arrêtant par les bois un soir de neige
2
+
3
+ Auteur: Robert Frost
4
+
5
+ À qui sont ces bois, je crois que je sais.
6
+ Sa maison est au village pourtant ;
7
+ Il ne me verra pas m'arrêter ici
8
+ À regarder ses bois recouverts de neige.
9
+
10
+ Mon petit cheval doit trouver cela étrange
11
+ De s'arrêter loin de toute ferme aux alentours
12
+ Entre ces bois et ce lac gelé
13
+ Au soir le plus sombre de l'année.
14
+
15
+ Il fait tinter les clochettes de son harnais en tirant dessus
16
+ Comme pour demander s'il n'y a pas là quelque erreur.
17
+ Les seuls autres bruits le souffle
18
+ D'une brise légère et le son duveteux des flocons.
19
+
20
+ Les bois sont beaux, sombres et profonds,
21
+ Mais j'ai des promesses à tenir
22
+ Et un long chemin à parcourir avant de dormir,
23
+ Et un long chemin à parcourir avant de dormir.
@@ -1 +1,21 @@
1
- Любя, съешь щипцы, вздохнёт мэр, — кайф жгуч. Шеф взъярён тчк щипцы с эхом гудбай Жюль. Эй, жлоб! Где туз? Прячь юных съёмщиц в шкаф. Экс-граф? Плюш изъят. Бьём чуждый цен хвощ! Эх, чужак! Общий съём цен шляп (юфть) — вдрызг! Любя, съешь щипцы, — вздохнёт мэр, — кайф жгуч. Шеф взъярён тчк щипцы с эхом гудбай Жюль. Эй, жлоб! Где туз? Прячь юных съёмщиц в шкаф. Экс-граф? Плюш изъят. Бьём чуждый цен хвощ! Эх, чужак! Общий съём цен шляп (юфть) — вдрызг! Любя, съешь щипцы, — вздохнёт мэр, — кайф жгуч. Шеф взъярён тчк щипцы с эхом гудбай Жюль. Эй, жлоб! Где туз? Прячь юных съёмщиц в шкаф. Экс-граф? Плюш изъят. Бьём чуждый цен хвощ! Эх, чужак! Общий съём цен шляп (юфть) — вдрызг! Любя, съешь щипцы, — вздохнёт мэр, — кайф жгуч. Шеф взъярён тчк щипцы с эхом гудбай Жюль. Эй, жлоб! Где туз? Прячь юных съёмщиц в шкаф. Экс-граф? Плюш изъят. Бьём чуждый цен хвощ! Эх, чужак! Общий съём цен шляп (юфть) — вдрызг!Любя, съешь щипцы, — вздохнёт мэр, — кайф жгуч. Шеф взъярён тчк щипцы с эхом гудбай Жюль. Эй, жлоб! Где туз? Прячь юных съёмщиц в шкаф. Экс-граф? Плюш изъят. Бьём чуждый цен
1
+ Остановившись у леса снежным вечером
2
+
3
+ Чей лес, мне кажется, я знаю:
4
+ в селе живет его хозяин.
5
+ Он не увидит, как на снежный
6
+ я лес его стою взираю.
7
+
8
+ В недоуменье конь, конечно,
9
+ зачем в ночи за год темнейшей
10
+ мы стали там, где нет жилья,
11
+ у леса с озером замерзшим.
12
+
13
+ Он, бубенцом слегка звеня,
14
+ как будто бы корит меня,
15
+ да веет слабый ветерок,
16
+ пушистым снегом шелестя.
17
+
18
+ Лес сладок, темен и глубок,
19
+ но в путь пора мне — долг есть долг.
20
+ И ехать долго — сон далек,
21
+ и ехать долго — сон далек.
@@ -1 +1,23 @@
1
- First they ignore you, then they ridicule you, then they fight you, then you win.
1
+ Stopping by Woods on a Snowy Evening
2
+
3
+ By Robert Frost
4
+
5
+ Whose woods these are I think I know.
6
+ His house is in the village though;
7
+ He will not see me stopping here
8
+ To watch his woods fill up with snow.
9
+
10
+ My little horse must think it queer
11
+ To stop without a farmhouse near
12
+ Between the woods and frozen lake
13
+ The darkest evening of the year.
14
+
15
+ He gives his harness bells a shake
16
+ To ask if there is some mistake.
17
+ The only other sound’s the sweep
18
+ Of easy wind and downy flake.
19
+
20
+ The woods are lovely, dark and deep,
21
+ But I have promises to keep,
22
+ And miles to go before I sleep,
23
+ And miles to go before I sleep.
@@ -1 +1,23 @@
1
- First they ignore you, then they ridicule you, then they fight you, then you win.
1
+ Stopping by Woods on a Snowy Evening
2
+
3
+ By Robert Frost
4
+
5
+ Whose woods these are I think I know.
6
+ His house is in the village though;
7
+ He will not see me stopping here
8
+ To watch his woods fill up with snow.
9
+
10
+ My little horse must think it queer
11
+ To stop without a farmhouse near
12
+ Between the woods and frozen lake
13
+ The darkest evening of the year.
14
+
15
+ He gives his harness bells a shake
16
+ To ask if there is some mistake.
17
+ The only other sound’s the sweep
18
+ Of easy wind and downy flake.
19
+
20
+ The woods are lovely, dark and deep,
21
+ But I have promises to keep,
22
+ And miles to go before I sleep,
23
+ And miles to go before I sleep.
@@ -4,165 +4,193 @@ require 'spec_helper'
4
4
  require 'webrick'
5
5
 
6
6
  include WEBrick
7
-
8
- describe Rika::Parser do
9
- before(:all) do
10
- @txt_parser = Rika::Parser.new(file_path("text_file.txt"))
11
- @docx_parser = Rika::Parser.new(file_path("document.docx"))
12
- @pdf_parser = Rika::Parser.new(file_path("document.pdf"))
13
- @image_parser = Rika::Parser.new(file_path("image.jpg"))
14
- @unknown_parser = Rika::Parser.new(file_path("unknown.bin"))
15
- @dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
16
- port = 50505
17
- @url = "http://#{Socket.gethostname}:#{port}"
18
- @quote = "First they ignore you, then they ridicule you, then they fight you, then you win."
19
- @t1 = Thread.new do
20
- @server = HTTPServer.new(:Port => port, :DocumentRoot => @dir,
21
- :AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
22
- @server.start
23
- end
24
- @sample_pdf_filespec = file_path("document.pdf")
25
- end
26
7
 
27
- after(:all) do
28
- @t1.exit
29
- end
8
+ describe Rika::Parser do
9
+
10
+ let (:txt_parser) { Rika::Parser.new(file_path('text_file.txt')) }
11
+ let (:docx_parser) { Rika::Parser.new(file_path('document.docx')) }
12
+ let (:doc_parser) { Rika::Parser.new(file_path('document.doc')) }
13
+ let (:pdf_parser) { Rika::Parser.new(file_path('document.pdf')) }
14
+ let (:image_parser) { Rika::Parser.new(file_path('image.jpg')) }
15
+ let (:unknown_parser) { Rika::Parser.new(file_path('unknown.bin')) }
16
+ let (:dir) { File.expand_path(File.join(File.dirname(__FILE__), 'fixtures')) }
17
+ let (:quote_first_line) { 'Stopping by Woods on a Snowy Evening' }
18
+
19
+ port = 50515
20
+ let (:url) { "http://#{Socket.gethostname}:#{port}" }
21
+
22
+ let (:sample_pdf_filespec) { file_path('document.pdf') }
23
+
24
+ let(:first_line) { ->(string) { string.split("\n").first.strip } }
25
+
26
+ let(:server_runner) do
27
+ # returns a lambda that, when passed an action, will wrap it in an HTTP server
28
+ ->(action) do
29
+ server = nil
30
+ server_thread = Thread.new do
31
+ server = HTTPServer.new(
32
+ Port: port,
33
+ DocumentRoot: dir,
34
+ AccessLog: [],
35
+ Logger: WEBrick::Log::new('/dev/null')
36
+ )
37
+ server.start
38
+ end
39
+
40
+ # Wait for server to become ready on its new thread
41
+ sleep 0.01 while server.nil?
42
+ begin
43
+ action.call
44
+ ensure
45
+ server.shutdown
46
+ server_thread.exit
47
+ end
48
+ end
49
+ end
30
50
 
31
- it "should raise error if file does not exists" do
32
- lambda { Rika::Parser.new(file_path("nonsense.txt")) }.should raise_error(IOError)
33
- end
34
51
 
35
- it "should raise error if URL does not exists" do
36
- lambda { Rika::Parser.new("http://nonsense.com/whatever.pdf") }.should raise_error(IOError)
52
+ it 'should raise error if file does not exist' do
53
+ expect(-> { Rika::Parser.new(file_path('nonexistent_file.txt')) }).to raise_error(IOError)
37
54
  end
38
55
 
39
- it "should detect file type without a file extension" do
40
- parser = Rika::Parser.new(file_path("text_file_without_extension"))
41
- parser.metadata["Content-Type"].should == "text/plain; charset=ISO-8859-1"
56
+ it 'should raise error if URL does not exist' do
57
+ unavailable_server = 'http://k6075sd0dfkr8nvfw0zvwfwckucf2aba.com'
58
+ unavailable_file_on_web = File.join(unavailable_server, 'x.pdf')
59
+ expect(-> { Rika::Parser.new(unavailable_file_on_web) }).to raise_error(SocketError)
42
60
  end
43
61
 
44
- it "should not be possible to trick the parser to read a folder with an extension" do
45
- lambda { Rika::Parser.new(file_path("folder.js")).content }.should raise_error(IOError)
62
+ it 'should detect file type without a file extension' do
63
+ parser = Rika::Parser.new(file_path('text_file_without_extension'))
64
+ expect(parser.metadata['Content-Type']).to eq('text/plain; charset=UTF-8')
46
65
  end
47
66
 
48
67
  describe '#content' do
49
- it "should return the content in a text file" do
50
- @txt_parser.content.strip.should == @quote
68
+ it 'should return the content in a text file' do
69
+ expect(first_line.(txt_parser.content)).to eq(quote_first_line)
51
70
  end
52
71
 
53
- it "should return the content in a docx file" do
54
- @docx_parser.content.should == @quote
72
+ it 'should return the content in a docx file' do
73
+ expect(first_line.(docx_parser.content)).to eq(quote_first_line)
55
74
  end
56
75
 
57
- it "should return the content in a pdf file" do
58
- @pdf_parser.content.should == @quote
76
+ it 'should return the content in a pdf file' do
77
+ expect(first_line.(pdf_parser.content)).to eq(quote_first_line)
59
78
  end
60
79
 
61
- it "should return no content for an image" do
62
- @image_parser.content.should be_empty
80
+ it 'should return no content for an image' do
81
+ expect(image_parser.metadata.keys).to_not be_empty
63
82
  end
64
83
 
65
- it "should only return max content length" do
66
- parser = Rika::Parser.new(file_path("text_file.txt"), 5)
67
- parser.content.should == "First"
84
+ it 'should only return max content length' do
85
+ expect(Rika::Parser.new(file_path('text_file.txt'), 9).content).to eq('Stopping')
68
86
  end
69
87
 
70
- it "should only return max content length for file over http" do
71
- parser = Rika::Parser.new(@url + "/document.pdf", 6)
72
- parser.content.should == "First"
88
+ it 'should only return max content length for file over http', focus: true do
89
+ server_runner.call( -> do
90
+ expect(Rika::Parser.new(File.join(url, 'document.pdf'), 9).content).to eq('Stopping')
91
+ end)
73
92
  end
74
93
 
75
- it "should be possible to read files over 100k by default" do
76
- parser = Rika::Parser.new(file_path("over_100k_file.txt"))
77
- parser.content.length.should == 101_761
94
+ it 'should return the content from a file over http' do
95
+ server_runner.call( -> do
96
+ content = Rika::Parser.new(File.join(url, 'document.pdf')).content
97
+ expect(first_line.(content)).to eq(quote_first_line)
98
+ end)
78
99
  end
79
100
 
80
- it "should return the content from a file over http" do
81
- parser = Rika::Parser.new(@url + "/document.pdf")
82
- parser.content.should == @quote
83
- end
84
-
85
- it "should return empty string for unknown file" do
86
- @unknown_parser.content.should be_empty
101
+ it 'should return empty string for unknown file' do
102
+ expect(unknown_parser.content).to be_empty
87
103
  end
88
104
  end
89
105
 
90
- # We just test a few of the metadata fields for some common file formats
91
- # to make sure the integration with Apache Tika works. Apache Tika already
106
+ # We just test a few of the metadata fields for some common file formats
107
+ # to make sure the integration with Apache Tika works. Apache Tika already
92
108
  # has tests for all file formats it supports, so we won't retest them
93
109
  describe '#metadata' do
94
- it "should return nil if metadata field does not exists" do
95
- @txt_parser.metadata["nonsense"].should be_nil
110
+ it 'should return nil if metadata field does not exist' do
111
+ expect(txt_parser.metadata['nonsense']).to be_nil
96
112
  end
97
113
 
98
- it "should return metadata from a docx file" do
99
- @docx_parser.metadata["Page-Count"].should == "1"
114
+ it 'should return metadata from a docx file' do
115
+ expect(docx_parser.metadata['Page-Count']).to eq('1')
100
116
  end
101
117
 
102
- it "should return metadata from a pdf file" do
103
- @pdf_parser.metadata["title"].should == "A simple title"
118
+ it 'should return metadata from a pdf file' do
119
+ expect(pdf_parser.metadata['Author']).to eq('Robert Frost')
104
120
  end
105
121
 
106
- it "should return metadata from a file over http" do
107
- parser = Rika::Parser.new(@url + "/document.pdf")
108
- parser.metadata["title"].should == "A simple title"
122
+ it 'should return metadata from a file over http', focus: true do
123
+ server_runner.call( -> do
124
+ parser = Rika::Parser.new(File.join(url, 'document.pdf'))
125
+ expect(parser.metadata['Author']).to eq('Robert Frost')
126
+ end)
109
127
  end
110
128
 
111
- it "should return metadata from an image" do
112
- @image_parser.metadata["Image Height"].should == "72 pixels"
113
- @image_parser.metadata["Image Width"].should == "72 pixels"
129
+ it 'should return metadata from an image' do
130
+ expect(image_parser.metadata['Image Height']).to eq('72 pixels')
131
+ expect(image_parser.metadata['Image Width']).to eq('72 pixels')
114
132
  end
115
133
  end
116
134
 
117
135
  describe '#available_metadata' do
118
- it "should return available metadata fields" do
119
- @txt_parser.available_metadata.should_not be_empty
136
+ it 'should return available metadata fields' do
137
+ expect(txt_parser.available_metadata).to_not be_empty
120
138
  end
121
139
 
122
- it "should be an array" do
123
- @txt_parser.available_metadata.is_a?(Array).should == true
140
+ it 'should be an array' do
141
+ expect(txt_parser.available_metadata).to be_an(Array)
124
142
  end
125
143
  end
126
144
 
127
145
  describe '#metadata_exists?' do
128
- it "should return false if metadata does not exists" do
129
- @txt_parser.metadata_exists?("title").should == false
146
+ it 'should return false if metadata does not exist' do
147
+ expect(txt_parser.metadata_exists?('title')).to be false
130
148
  end
131
149
 
132
- it "should return true if metadata exists" do
133
- @docx_parser.metadata_exists?("title").should == true
150
+ it 'should return true if metadata exist' do
151
+ expect(docx_parser.metadata_exists?('title')).to be true
134
152
  end
135
153
  end
136
154
 
137
155
  describe '#media_type' do
138
- it "should return application/pdf for a pdf file" do
139
- @pdf_parser.media_type.should == "application/pdf"
156
+ it 'should return application/pdf for a pdf file' do
157
+ expect(pdf_parser.media_type).to eq('application/pdf')
158
+ end
159
+
160
+ it 'should return text/plain for a txt file' do
161
+ expect(txt_parser.media_type).to eq('text/plain')
140
162
  end
141
163
 
142
- it "should return text/plain for a txt file" do
143
- @txt_parser.media_type.should == "text/plain"
164
+ it 'should return application/pdf for a pdf over http' do
165
+ server_runner.call( -> do
166
+ parser = Rika::Parser.new(File.join(url, 'document.pdf'))
167
+ expect(parser.media_type).to eq('application/pdf')
168
+ end)
144
169
  end
145
170
 
146
- it "should return application/pdf for a pdf over http" do
147
- parser = Rika::Parser.new(@url + "/document.pdf")
148
- parser.media_type.should == "application/pdf"
171
+ it 'should return application/octet-stream for unknown file' do
172
+ expect(unknown_parser.media_type).to eq('application/octet-stream')
149
173
  end
150
174
 
151
- it "should return application/octet-stream for unknown file" do
152
- @unknown_parser.media_type.should == "application/octet-stream"
175
+ it 'should return msword for a doc file' do
176
+ expect(doc_parser.media_type).to eq('application/msword')
177
+ end
178
+
179
+ it 'should return wordprocessingml for a docx file' do
180
+ expect(docx_parser.media_type).to eq('application/vnd.openxmlformats-officedocument.wordprocessingml.document')
153
181
  end
154
182
  end
155
183
 
156
184
  describe '#language' do
157
- it "should return the language of the content" do
158
-
159
- ["en", "de", "fr", "ru", "es"].each do |lang|
185
+ it 'should return the language of the content' do
186
+ %w(en de fr ru es).each do |lang|
160
187
  txt = Rika::Parser.new(file_path("#{lang}.txt"))
161
- txt.language.should == lang
188
+ expect(txt.language).to eq(lang)
162
189
  end
163
190
  end
164
191
  end
165
192
 
193
+ # See note in rika.rb #language_is_reasonably_certain? regarding this method's future.
166
194
  describe '#language_is_reasonably_certain?' do
167
195
  it "should return false if lang can't be determined" do
168
196
  lang = Rika::Parser.new(file_path("lang_cant_be_determined.txt"))
@@ -175,19 +203,43 @@ describe Rika::Parser do
175
203
  end
176
204
  end
177
205
 
178
- it "should return valid content using Rika.parse_content" do
179
- content = Rika.parse_content(@sample_pdf_filespec)
180
- (content.should be_a(String)) && (content.should_not be_empty)
206
+ it 'should return valid content using Rika.parse_content' do
207
+ content = Rika.parse_content(sample_pdf_filespec)
208
+ expect(content).to be_a(String)
209
+ expect(content).to_not be_empty
181
210
  end
182
211
 
183
- it "should return valid metadata using Rika.parse_metadata" do
184
- metadata = Rika.parse_metadata(@sample_pdf_filespec)
185
- (metadata.should be_a(Hash)) && (metadata.should_not be_empty)
212
+ it 'should return valid metadata using Rika.parse_metadata' do
213
+ metadata = Rika.parse_metadata(sample_pdf_filespec)
214
+ expect(metadata).to be_a(Hash)
215
+ expect(metadata).to_not be_empty
186
216
  end
187
217
 
188
- it "should return valid content and metadata using Rika.parse_content_and_metadata" do
189
- content, metadata = Rika.parse_content_and_metadata(@sample_pdf_filespec)
190
- (content.should be_a(String)) && (content.should_not be_empty) && \
191
- (metadata.should be_a(Hash)) && (metadata.should_not be_empty)
218
+ it 'should return valid content and metadata using Rika.parse_content_and_metadata' do
219
+ content, metadata = Rika.parse_content_and_metadata(sample_pdf_filespec)
220
+ expect(content).to be_a(String)
221
+ expect(content).to_not be_empty
222
+ expect(metadata).to be_a(Hash)
223
+ expect(metadata).to_not be_empty
224
+ end
225
+
226
+ specify 'both means of getting both content and metadata should return the same values' do
227
+ content_1, metadata_1 = Rika.parse_content_and_metadata(sample_pdf_filespec)
228
+
229
+ h = Rika.parse_content_and_metadata_as_hash(sample_pdf_filespec)
230
+ content_2 = h[:content]
231
+ metadata_2 = h[:metadata]
232
+
233
+ expect(content_1).to eq(content_2)
234
+ expect(metadata_1).to eq(metadata_2)
235
+ end
236
+
237
+ specify 'getting content and metadata individually and together should return the same values' do
238
+ content_1, metadata_1 = Rika.parse_content_and_metadata(sample_pdf_filespec, -1)
239
+ content_2 = Rika.parse_content(sample_pdf_filespec)
240
+ metadata_2 = Rika.parse_metadata(sample_pdf_filespec, -1)
241
+
242
+ expect(content_1).to eq(content_2)
243
+ expect(metadata_1).to eq(metadata_2)
192
244
  end
193
245
  end