picolena 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. data/History.txt +8 -0
  2. data/Manifest.txt +28 -15
  3. data/README.txt +1 -1
  4. data/config/files_to_clean +2 -1
  5. data/config/hoe.rb +1 -1
  6. data/lib/picolena/config/basic.rb +46 -35
  7. data/lib/picolena/config/icons_and_filetypes.yml +69 -0
  8. data/lib/picolena/config/indexed_directories.yml +1 -1
  9. data/lib/picolena/picolena_generator.rb +3 -1
  10. data/lib/picolena/templates/app/controllers/application.rb +2 -2
  11. data/lib/picolena/templates/app/controllers/documents_controller.rb +1 -1
  12. data/lib/picolena/templates/app/helpers/documents_helper.rb +7 -26
  13. data/lib/picolena/templates/app/models/document.rb +32 -14
  14. data/lib/picolena/templates/app/models/finder.rb +21 -78
  15. data/lib/picolena/templates/app/models/index_reader.rb +56 -0
  16. data/lib/picolena/templates/app/models/index_writer.rb +36 -0
  17. data/lib/picolena/templates/app/models/indexer.rb +142 -0
  18. data/lib/picolena/templates/app/models/plain_text_extractor.rb +122 -0
  19. data/lib/picolena/templates/app/models/query.rb +31 -0
  20. data/lib/picolena/templates/app/views/documents/_document.html.haml +2 -2
  21. data/lib/picolena/templates/config/environment.rb +2 -2
  22. data/lib/picolena/templates/config/environments/development.rb +1 -1
  23. data/lib/picolena/templates/config/environments/production.rb +1 -1
  24. data/lib/picolena/templates/config/environments/test.rb +1 -1
  25. data/lib/picolena/templates/config/initializers/002_load_indexed_dirs.rb +2 -0
  26. data/lib/picolena/templates/config/initializers/003_load_white_list_IPs.rb +3 -1
  27. data/lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb +6 -0
  28. data/lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb +2 -0
  29. data/lib/picolena/templates/config/initializers/006_load_icons.rb +8 -0
  30. data/lib/picolena/templates/lib/core_exts.rb +20 -1
  31. data/lib/picolena/templates/lib/plain_text_extractor_DSL.rb +72 -0
  32. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/adobe.pdf.rb +3 -3
  33. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/html.rb +2 -2
  34. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.excel.rb +4 -4
  35. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.powerpoint.rb +4 -4
  36. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.rtf.rb +2 -2
  37. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.word.rb +4 -4
  38. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.presentation.rb +2 -2
  39. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.spreadsheet.rb +2 -2
  40. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.text.rb +2 -2
  41. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/plain_text.rb +3 -3
  42. data/lib/picolena/templates/lib/tasks/index.rake +4 -6
  43. data/lib/picolena/templates/lib/tasks/install_dependencies.rake +1 -1
  44. data/lib/picolena/templates/spec/controllers/documents_controller_spec.rb +5 -5
  45. data/lib/picolena/templates/spec/helpers/documents_helper_spec.rb +1 -1
  46. data/lib/picolena/templates/spec/models/basic_finder_spec.rb +13 -13
  47. data/lib/picolena/templates/spec/models/document_spec.rb +1 -1
  48. data/lib/picolena/templates/spec/models/finder_spec.rb +5 -70
  49. data/lib/picolena/templates/spec/models/host_indexing_system_spec.rb +6 -2
  50. data/lib/picolena/templates/spec/models/index_directories_spec.rb +4 -4
  51. data/lib/picolena/templates/spec/models/index_reader_spec.rb +7 -0
  52. data/lib/picolena/templates/spec/models/index_writer_spec.rb +7 -0
  53. data/lib/picolena/templates/spec/models/indexer_spec.rb +7 -0
  54. data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +42 -0
  55. data/lib/picolena/templates/spec/models/query_spec.rb +56 -0
  56. data/lib/picolena/templates/spec/test_dirs/indexed/lang/goethe +42 -0
  57. data/lib/picolena/templates/spec/test_dirs/indexed/lang/hugo +83 -0
  58. data/lib/picolena/templates/spec/test_dirs/indexed/lang/lorca +86 -0
  59. data/lib/picolena/templates/spec/test_dirs/indexed/lang/shakespeare +90 -0
  60. data/lib/picolena/version.rb +1 -1
  61. data/tasks/hack.rake +2 -1
  62. data/website/index.html +2 -2
  63. data.tar.gz.sig +0 -0
  64. metadata +30 -17
  65. metadata.gz.sig +0 -0
  66. data/lib/picolena/templates/config/initializers/004_load_filters.rb +0 -6
  67. data/lib/picolena/templates/lib/ff.rb +0 -117
  68. data/lib/picolena/templates/lib/filter.rb +0 -75
  69. data/lib/picolena/templates/lib/filter_DSL.rb +0 -77
  70. data/lib/picolena/templates/spec/models/filters_spec.rb +0 -30
@@ -0,0 +1,86 @@
1
+ Verde que te quiero verde.
2
+ Verde viento. Verdes ramas.
3
+ El barco sobre la mar
4
+ y el caballo en la montaña.
5
+ Con la sombra en la cintura
6
+ ella sueña en su baranda,
7
+ verde carne, pelo verde,
8
+ con ojos de fría plata.
9
+ Verde que te quiero verde.
10
+ Bajo la luna gitana,
11
+ las cosas la están mirando
12
+ y ella no puede mirarlas.
13
+ Verde que te quiero verde.
14
+ Grandes estrellas de escarcha
15
+ vienen con el pez de sombra
16
+ que abre el camino del alba.
17
+ La higuera frota su viento
18
+ con la lija de sus ramas,
19
+ y el monte, gato garduño,
20
+ eriza sus pitas agrias.
21
+ ¿Pero quién vendrá? ¿Y por dónde...?
22
+ Ella sigue en su baranda,
23
+ Verde carne, pelo verde,
24
+ soñando en la mar amarga.
25
+ --Compadre, quiero cambiar
26
+ mi caballo por su casa,
27
+ mi montura por su espejo,
28
+ mi cuchillo por su manta.
29
+ Compadre, vengo sangrando,
30
+ desde los puertos de Cabra.
31
+ --Si yo pudiera, mocito,
32
+ este trato se cerraba.
33
+ Pero yo ya no soy yo,
34
+ ni mi casa es ya mi casa.
35
+ --Compadre, quiero morir
36
+ decentemente en mi cama.
37
+ De acero, si puede ser,
38
+ con las sábanas de holanda.
39
+ ¿No ves la herida que tengo
40
+ desde el pecho a la garganta?
41
+ --Trescientas rosas morenas
42
+ lleva tu pechera blanca.
43
+ Tu sangre rezuma y huele
44
+ alrededor de tu faja.
45
+ Pero yo ya no soy yo,
46
+ ni mi casa es ya mi casa.
47
+ --Dejadme subir al menos
48
+ hasta las altas barandas;
49
+ ¡dejadme subir!, dejadme,
50
+ hasta las verdes barandas.
51
+ Barandales de la luna
52
+ por donde retumba el agua.
53
+ Ya suben los dos compadres
54
+ hacia las altas barandas.
55
+ Dejando un rastro de sangre.
56
+ Dejando un rastro de lágrimas.
57
+ Temblaban en los tejados
58
+ farolillos de hojalata.
59
+ Mil panderos de cristal
60
+ herían la madrugada.
61
+ Verde que te quiero verde,
62
+ verde viento, verdes ramas.
63
+ Los dos compadres subieron.
64
+ El largo viento dejaba
65
+ en la boca un raro gusto
66
+ de hiel, de menta y de albahaca.
67
+ ¡Compadre! ¿Dónde está, dime?
68
+ ¿Dónde está tu niña amarga?
69
+ ¡Cuántas veces te esperó!
70
+ ¡Cuántas veces te esperara,
71
+ cara fresca, negro pelo,
72
+ en esta verde baranda!
73
+ Sobre el rostro del aljibe
74
+ se mecía la gitana.
75
+ Verde carne, pelo verde,
76
+ con ojos de fría plata.
77
+ Un carámbano de luna
78
+ la sostiene sobre el agua.
79
+ La noche se puso íntima
80
+ como una pequeña plaza.
81
+ Guardias civiles borrachos
82
+ en la puerta golpeaban.
83
+ Verde que te quiero verde.
84
+ Verde viento. Verdes ramas.
85
+ El barco sobre la mar.
86
+ Y el caballo en la montaña.
@@ -0,0 +1,90 @@
1
+ THE PHOENIX AND THE TURTLE
2
+ A POEM BY
3
+ WILLIAM SHAKESPEARE
4
+
5
+ The Phoenix and the Turtle
6
+ Let the bird of loudest lay
7
+ On the sole Arabian tree,
8
+ Herald sad and trumpet be,
9
+ To whose sound chaste wings obey.
10
+ But thou shrieking harbinger,
11
+ Foul precurrer of the fiend,
12
+ Augur of the fever's end,
13
+ To this troop come thou not near.
14
+
15
+ From this session interdict
16
+ Every fowl of tyrant wing
17
+ Save the eagle, feather'd king:
18
+ Keep the obsequy so strict.
19
+
20
+ Let the priest in surplice white
21
+ That defunctive music can,
22
+ Be the death-divining swan,
23
+ Lest the requiem lack his right.
24
+
25
+ And thou, treble-dated crow,
26
+ That thy sable gender mak'st
27
+ With the breath thou giv'st and tak'st,
28
+ 'Mongst our mourners shalt thou go.
29
+
30
+ Here the anthem doth commence:—
31
+ Love and constancy is dead;
32
+ Phoenix and the turtle fled
33
+ In a mutual flame from hence.
34
+
35
+ So they loved, as love in twain
36
+ Had the essence but in one;
37
+ Two distincts, division none;
38
+ Number there in love was slain.
39
+
40
+ Hearts remote, yet not asunder;
41
+ Distance, and no space was seen
42
+ 'Twixt the turtle and his queen:
43
+ But in them it were a wonder.
44
+
45
+ So between them love did shine,
46
+ That the turtle saw his right
47
+ Flaming in the phoenix' sight;
48
+ Either was the other's mine.
49
+
50
+ Property was thus appall'd,
51
+ That the self was not the same;
52
+ Single nature's double name
53
+ Neither two nor one was call'd.
54
+
55
+ Reason, in itself confounded,
56
+ Saw division grow together;
57
+ To themselves yet either neither;
58
+ Simple were so well compounded,
59
+
60
+ That it cried, 'How true a twain
61
+ Seemeth this concordant one!
62
+ Love hath reason, reason none
63
+ If what parts can so remain.'
64
+
65
+ Whereupon it made this threne
66
+ To the phoenix and the dove,
67
+ Co-supremes and stars of love,
68
+ As chorus to their tragic scene.
69
+
70
+ THRENOS
71
+
72
+ BEAUTY, truth, and rarity,
73
+ Grace in all simplicity,
74
+ Here enclosed in cinders lie.
75
+
76
+ Death is now the phoenix' nest;
77
+ And the turtle's loyal breast
78
+ To eternity doth rest,
79
+
80
+ Leaving no posterity:
81
+ 'Twas not their infirmity,
82
+ It was married chastity.
83
+
84
+ Truth may seem, but cannot be;
85
+ Beauty brag, but 'tis not she;
86
+ Truth and beauty buried be.
87
+
88
+ To this urn let those repair
89
+ That are either true or fair;
90
+ For these dead birds sigh a prayer.
@@ -2,7 +2,7 @@ module Picolena #:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
4
  MINOR = 1
5
- TINY = 1
5
+ TINY = 2
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
data/tasks/hack.rake CHANGED
@@ -2,7 +2,8 @@ desc 'Create development picolena structure inside lib/picolena/templates'
2
2
  task :lets_hack do
3
3
  picolena_root=File.join(File.dirname(__FILE__),'..')
4
4
  Dir.chdir(picolena_root){
5
- system("ruby bin/picolena lib/picolena/templates/spec/test_dirs --skip --no-index --no-spec --destination=lib/picolena/templates")
5
+ # Doesn't overwrite any file, doesn't create any index, doesn't launch any spec.
6
+ system("ruby bin/picolena lib/picolena/templates/spec/test_dirs/indexed --skip --no-index --no-spec --destination=lib/picolena/templates")
6
7
  }
7
8
  puts <<-EXPLAIN
8
9
 
data/website/index.html CHANGED
@@ -33,7 +33,7 @@
33
33
  <h1>Picolena</h1>
34
34
  <div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/picolena"; return false'>
35
35
  <p>Get Version</p>
36
- <a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.1</a>
36
+ <a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.2</a>
37
37
  </div>
38
38
  <h1>&#x2192; &#8216;picolena&#8217;</h1>
39
39
 
@@ -114,7 +114,7 @@ ruby script/server</code></pre>
114
114
 
115
115
  <p>Comments are welcome. Send an email to <a href="mailto:eric_duminil@rubyforge.org">Eric Duminil</a> email via the <a href="http://groups.google.com/group/picolena">forum</a></p>
116
116
  <p class="coda">
117
- <a href="eric_duminil@rubyforge.org">Eric DUMINIL</a>, 12th April 2008<br>
117
+ <a href="eric_duminil@rubyforge.org">Eric DUMINIL</a>, 20th April 2008<br>
118
118
  Theme extended from <a href="http://rb2js.rubyforge.org/">Paul Battley</a>,
119
119
  by Daniel Cadenas via <a href="http://depgraph.rubyforge.org/">DepGraph</a>
120
120
  </p>
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: picolena
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eric Duminil
@@ -30,7 +30,7 @@ cert_chain:
30
30
  qvI9FgPZ1QTG5uZAlBbk6d6JU2XfpA==
31
31
  -----END CERTIFICATE-----
32
32
 
33
- date: 2008-04-12 00:00:00 +02:00
33
+ date: 2008-04-20 00:00:00 +02:00
34
34
  default_executable:
35
35
  dependencies:
36
36
  - !ruby/object:Gem::Dependency
@@ -143,6 +143,7 @@ files:
143
143
  - config/requirements.rb
144
144
  - lib/picolena/USAGE
145
145
  - lib/picolena/config/basic.rb
146
+ - lib/picolena/config/icons_and_filetypes.yml
146
147
  - lib/picolena/config/indexed_directories.yml
147
148
  - lib/picolena/config/title_and_names_and_links.yml
148
149
  - lib/picolena/config/white_list_ip.yml
@@ -155,6 +156,11 @@ files:
155
156
  - lib/picolena/templates/app/helpers/documents_helper.rb
156
157
  - lib/picolena/templates/app/models/document.rb
157
158
  - lib/picolena/templates/app/models/finder.rb
159
+ - lib/picolena/templates/app/models/index_reader.rb
160
+ - lib/picolena/templates/app/models/index_writer.rb
161
+ - lib/picolena/templates/app/models/indexer.rb
162
+ - lib/picolena/templates/app/models/plain_text_extractor.rb
163
+ - lib/picolena/templates/app/models/query.rb
158
164
  - lib/picolena/templates/app/views/documents/_document.html.haml
159
165
  - lib/picolena/templates/app/views/documents/cached.html.haml
160
166
  - lib/picolena/templates/app/views/documents/content.html.haml
@@ -169,27 +175,26 @@ files:
169
175
  - lib/picolena/templates/config/initializers/001_load_custom_config.rb
170
176
  - lib/picolena/templates/config/initializers/002_load_indexed_dirs.rb
171
177
  - lib/picolena/templates/config/initializers/003_load_white_list_IPs.rb
172
- - lib/picolena/templates/config/initializers/004_load_filters.rb
178
+ - lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb
173
179
  - lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb
180
+ - lib/picolena/templates/config/initializers/006_load_icons.rb
174
181
  - lib/picolena/templates/config/routes.rb
175
182
  - lib/picolena/templates/lang/ui/de.yml
176
183
  - lib/picolena/templates/lang/ui/en.yml
177
184
  - lib/picolena/templates/lang/ui/es.yml
178
185
  - lib/picolena/templates/lang/ui/fr.yml
179
186
  - lib/picolena/templates/lib/core_exts.rb
180
- - lib/picolena/templates/lib/ff.rb
181
- - lib/picolena/templates/lib/filter.rb
182
- - lib/picolena/templates/lib/filter_DSL.rb
183
- - lib/picolena/templates/lib/filters/adobe.pdf.rb
184
- - lib/picolena/templates/lib/filters/html.rb
185
- - lib/picolena/templates/lib/filters/ms.excel.rb
186
- - lib/picolena/templates/lib/filters/ms.powerpoint.rb
187
- - lib/picolena/templates/lib/filters/ms.rtf.rb
188
- - lib/picolena/templates/lib/filters/ms.word.rb
189
- - lib/picolena/templates/lib/filters/opendocument.presentation.rb
190
- - lib/picolena/templates/lib/filters/opendocument.spreadsheet.rb
191
- - lib/picolena/templates/lib/filters/opendocument.text.rb
192
- - lib/picolena/templates/lib/filters/plain_text.rb
187
+ - lib/picolena/templates/lib/plain_text_extractor_DSL.rb
188
+ - lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb
189
+ - lib/picolena/templates/lib/plain_text_extractors/html.rb
190
+ - lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb
191
+ - lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb
192
+ - lib/picolena/templates/lib/plain_text_extractors/ms.rtf.rb
193
+ - lib/picolena/templates/lib/plain_text_extractors/ms.word.rb
194
+ - lib/picolena/templates/lib/plain_text_extractors/opendocument.presentation.rb
195
+ - lib/picolena/templates/lib/plain_text_extractors/opendocument.spreadsheet.rb
196
+ - lib/picolena/templates/lib/plain_text_extractors/opendocument.text.rb
197
+ - lib/picolena/templates/lib/plain_text_extractors/plain_text.rb
193
198
  - lib/picolena/templates/lib/tasks/annotations.rake
194
199
  - lib/picolena/templates/lib/tasks/index.rake
195
200
  - lib/picolena/templates/lib/tasks/install_dependencies.rake
@@ -246,10 +251,14 @@ files:
246
251
  - lib/picolena/templates/spec/helpers/documents_helper_spec.rb
247
252
  - lib/picolena/templates/spec/models/basic_finder_spec.rb
248
253
  - lib/picolena/templates/spec/models/document_spec.rb
249
- - lib/picolena/templates/spec/models/filters_spec.rb
250
254
  - lib/picolena/templates/spec/models/finder_spec.rb
251
255
  - lib/picolena/templates/spec/models/host_indexing_system_spec.rb
252
256
  - lib/picolena/templates/spec/models/index_directories_spec.rb
257
+ - lib/picolena/templates/spec/models/index_reader_spec.rb
258
+ - lib/picolena/templates/spec/models/index_writer_spec.rb
259
+ - lib/picolena/templates/spec/models/indexer_spec.rb
260
+ - lib/picolena/templates/spec/models/plain_text_extractor_spec.rb
261
+ - lib/picolena/templates/spec/models/query_spec.rb
253
262
  - lib/picolena/templates/spec/rcov.opts
254
263
  - lib/picolena/templates/spec/spec.opts
255
264
  - lib/picolena/templates/spec/spec_helper.rb
@@ -274,6 +283,10 @@ files:
274
283
  - lib/picolena/templates/spec/test_dirs/indexed/different_encodings/iso-8859-15.txt
275
284
  - lib/picolena/templates/spec/test_dirs/indexed/different_encodings/utf-8.txt
276
285
  - lib/picolena/templates/spec/test_dirs/indexed/just_one_doc/for_test.txt
286
+ - lib/picolena/templates/spec/test_dirs/indexed/lang/goethe
287
+ - lib/picolena/templates/spec/test_dirs/indexed/lang/hugo
288
+ - lib/picolena/templates/spec/test_dirs/indexed/lang/lorca
289
+ - lib/picolena/templates/spec/test_dirs/indexed/lang/shakespeare
277
290
  - lib/picolena/templates/spec/test_dirs/indexed/literature/Simulation of district heating systems for evaluation of real-time control strategies.pdf
278
291
  - lib/picolena/templates/spec/test_dirs/indexed/literature/Types of malfunction in DH substations.doc
279
292
  - lib/picolena/templates/spec/test_dirs/indexed/others/'weird'filename.txt
metadata.gz.sig CHANGED
Binary file
@@ -1,6 +0,0 @@
1
- require 'core_exts'
2
- require 'filter'
3
-
4
- Dir.glob(File.join(RAILS_ROOT,'lib/filters/*.rb')).each{|filter|
5
- require filter
6
- }
@@ -1,117 +0,0 @@
1
- #!/usr/bin/env ruby
2
- #
3
- # ff - Search and index document files using Ferret
4
- #
5
- # Author: Stuart Rackham <srackham@methods.co.nz>
6
- # License: This source code is released under the MIT license.
7
- # Home page: http://www.methods.co.nz/ff/
8
- #
9
- # Requisites:
10
- # - Ferret 0.10.4 or better installed as a Ruby Gem.
11
- # See http://ferret.davebalmain.com/trac for Ferret installation.
12
- # - External text file filters documented in lib/filters/*.rb.
13
-
14
- #TODO: Not Rubyish at all. Refactor all this!
15
-
16
- Analyzer=Ferret::Analysis::StandardAnalyzer.new
17
-
18
- # Add file +filename+ to the +index+.
19
- def index_file(index, filename, mime_type=nil)
20
- complete_path=File.expand_path(filename)
21
- fields = {
22
- :complete_path=> complete_path,
23
- :probably_unique_id => complete_path.base26_hash,
24
- :file => File.basename(filename),
25
- :basename => File.basename(filename, File.extname(filename)).gsub(/_/,' '),
26
- :filetype => File.extname(filename),
27
- :date => File.mtime(filename).strftime("%Y%m%d%H%M")
28
- }
29
-
30
- if mime_type then
31
- text = PlainText.extract_content_from(filename)
32
- raise "empty document #{filename}" if text.strip.empty?
33
- fields[:content] = text
34
- end
35
-
36
- index << fields
37
- end
38
-
39
- def index_file_and_increment_counter(index,filename,mime_type,counters)
40
- counters[mime_type] ||= Struct::Counter.new(0,0,0,0)
41
- counters[mime_type].count += 1
42
- counters[mime_type].size += File.size(filename)
43
- start=Time.now
44
- index_file(index, filename,mime_type)
45
- counters[mime_type].time_needed += Time.now-start
46
- end
47
-
48
- # Recursively add all qualifying files in directory +dir+ to +index+.
49
- def index_directory(index, dir, counters)
50
- #Index just everything!
51
- Dir.glob(File.join(dir,"**/*")) do |filename|
52
- # Skip Thumbs.db files
53
- if File.file?(filename) and not filename =~ /(Thumbs\.db)/
54
- begin
55
- IndexLogger.debug "indexing: #{filename}"
56
-
57
- # Trying to guess MIME type from file contents is not reliable for text
58
- # files. The strategy used here is to infer from file name extension
59
- # and rely on the convertor routine to fail if type is incorrect.
60
- mime_type = File.mime(filename)
61
- index_file_and_increment_counter(index,filename,mime_type,counters)
62
- rescue => e
63
- # if mime is unknown, just index filename, basename and extension
64
- IndexLogger.debug "indexing without content: #{e.message}"
65
- index_file(index, filename)
66
- counters[mime_type||'Unknown mime type'].without_content += 1
67
- end
68
- end
69
- end
70
- end
71
-
72
- def create_index(dirs)
73
- FileUtils.mkpath File.dirname(IndexSavePath)
74
- index = Ferret::Index::IndexWriter.new(:create => true, :path => IndexSavePath, :analyzer => Analyzer)
75
-
76
- add_fields(index)
77
-
78
- Struct.new('Counter', :size, :count, :without_content, :time_needed) unless Struct.constants.include?("Counter")
79
- counters = {}
80
- begin
81
- dirs.each { |dir| index_directory(index, dir, counters) }
82
- index.optimize
83
- ensure
84
- index.close
85
- end
86
- counters.each_pair do |key,value|
87
- IndexLogger.info "\n#{key}:"
88
- IndexLogger.info "files indexed: #{value.count} (#{value.size} bytes)"
89
- IndexLogger.info("files without_content: #{value.without_content}") unless value.without_content.zero?
90
- unless value.count.zero? or value.without_content==value.count then
91
- IndexLogger.info "time needed: #{(value.time_needed*1000).to_i} ms"
92
- IndexLogger.info "avg. time needed: #{(value.time_needed*1000/(value.count-value.without_content)).to_i} ms/file"
93
- end
94
- end
95
- total_count = counters.values.inject(0) {|sum,count| sum + count.count}
96
- total_size = counters.values.inject(0) {|sum,count| sum + count.size}
97
- total_without_content = counters.values.inject(0) {|sum,count| sum + count.without_content}
98
- total_time_needed = counters.values.inject(0) {|sum,count| sum + count.time_needed}
99
- IndexLogger.info "\ntotal files indexed: #{total_count} (#{total_size} bytes)"
100
- IndexLogger.info("total files without_content: #{total_without_content}") unless total_without_content.zero?
101
- unless total_count.zero? or total_count==total_without_content then
102
- IndexLogger.info "total time needed: #{(total_time_needed*1000).to_i} ms"
103
- IndexLogger.info "avg. time needed: #{(total_time_needed*1000/(total_count-total_without_content)).to_i} ms/file"
104
- end
105
- end
106
-
107
- def add_fields(index)
108
- # Although not intuitively obvious, until I (Stuart Rackham) tokenized the file name, wildcard
109
- # file name searches did not return all matching documents.
110
- index.field_infos.add_field(:complete_path, :store => :yes, :index => :yes)
111
- index.field_infos.add_field(:content, :store => :yes, :index => :yes)
112
- index.field_infos.add_field(:basename, :store => :no, :index => :yes, :boost => 1.5)
113
- index.field_infos.add_field(:file, :store => :no, :index => :yes, :boost => 1.5)
114
- index.field_infos.add_field(:filetype, :store => :no, :index => :yes, :boost => 1.5)
115
- index.field_infos.add_field(:date, :store=>:yes, :index=>:yes)
116
- index.field_infos.add_field(:probably_unique_id, :store=>:no, :index=>:yes)
117
- end
@@ -1,75 +0,0 @@
1
- require 'filter_DSL'
2
-
3
- module PlainText
4
- @@filters=[]
5
-
6
- #returns every defined filter
7
- def self.filters
8
- @@filters
9
- end
10
-
11
- #returns every required dependency for every defined filter
12
- def self.filter_dependencies
13
- @@dependencies||=filters.collect{|filter| filter.dependencies}.flatten.compact.uniq.sort
14
- end
15
-
16
- #returns every supported file extensions
17
- def self.supported_extensions
18
- @@supported_exts||=filters.collect{|filter| filter.exts}.flatten.compact.uniq
19
- end
20
-
21
- #finds which filter should be used for a given file, according to its extension
22
- def self.find_filter_for(filename)
23
- ext=File.ext_as_sym(filename)
24
- filter=filters.find{|filter| filter.exts.include?(ext)} || raise(ArgumentError, "no convertor for #{filename}")
25
- filter.source=filename
26
- filter
27
- end
28
-
29
- #launches filter on given file and outputs plain text result
30
- def self.extract_content_from(source)
31
- find_filter_for(source).extract_content
32
- end
33
-
34
-
35
- class Filter
36
- attr_accessor :source
37
-
38
- #parses command in order to know which programs are needed.
39
- #rspec will then check that every dependency is installed on the system
40
- def dependencies
41
- if command.is_a?(String) then
42
- command.split(/\|\s*/).collect{|command_part| command_part.split(/ /).first}
43
- else
44
- @dependencies
45
- end
46
- end
47
-
48
- #Conversion part
49
-
50
- #destination method can be used by some conversion command that cannot output to stdout (example?)
51
- #a file containing plain text result will first be written by command, and then be read by extract_content.
52
- def destination
53
- require 'tmpdir'
54
- @@temp_file_as_destination ||= File.join(Dir::tmpdir,"ferret_#{Time.now.to_i}")
55
- end
56
-
57
- #Replaces generic command with specific source and destination (if specified) files
58
- def specific_command
59
- command.sub('SOURCE','"'<<source<<'"').sub('DESTINATION','"'<<destination<<'"')
60
- end
61
-
62
- def extract_content
63
- if command.is_a?(String) then
64
- if command.include?('DESTINATION') then
65
- system(specific_command)
66
- File.read_and_remove(destination)
67
- else
68
- IO.popen(specific_command){|io| io.read}
69
- end
70
- else
71
- command.call(source)
72
- end
73
- end
74
- end
75
- end
@@ -1,77 +0,0 @@
1
- #Module used to define Filters with DSL
2
- #For example, to convert "Microsoft Office Word document" to plain text
3
- # PlainText.extract {
4
- # from :doc, :dot
5
- # as "application/msword"
6
- # aka "Microsoft Office Word document"
7
- # with "antiword SOURCE > DESTINATION 2>/dev/null" => :on_linux, "some other command" => :on_windows
8
- # which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc'
9
- # }
10
- module PlainText
11
- #defines a new Filter with DSL
12
- def self.extract(&block)
13
- filter = Filter.new
14
- filter.instance_eval(&block)
15
- @@filters<<filter
16
- MimeType.add(filter.exts,filter.mime_name)
17
- end
18
-
19
- #defined by DSL described in PlainText
20
- class Filter
21
- attr_reader :exts, :mime_name, :description, :command, :content_and_file_examples
22
-
23
- def initialize
24
- @content_and_file_examples=[]
25
- end
26
-
27
- def from(*exts)
28
- @exts=exts
29
- end
30
-
31
- def as(mime_name)
32
- @mime_name=mime_name
33
- end
34
-
35
- def aka(description)
36
- @description=description
37
- end
38
-
39
- def which_requires(*dependencies)
40
- @dependencies=dependencies
41
- end
42
-
43
- #used by rspec to test filters:
44
- # which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
45
- # or_extract 'some other stuff inside another pdf file', :from => 'yet_another.pdf'
46
- #
47
- #this spec will pass if 'basic.pdf' and 'yet_another.pdf' are included in an indexed directory, if every dependency is installed,
48
- #and if plain text output from the filter applied to 'basic.pdf' and 'yet_another.pdf' respectively include 'in a pdf file' and 'some other stuff inside another pdf file'
49
- def which_should_for_example_extract(content, file)
50
- @content_and_file_examples << [content,file[:from]]
51
- end
52
-
53
- #it allows to define specs in this way:
54
- # which_should_for_example_extract 'Hello world!', :from => 'hello.rb'
55
- # or_extract 'text inside!', :from => 'crossed.txt'
56
- alias_method :or_extract, :which_should_for_example_extract
57
-
58
- def with(command_as_hash_or_string=nil,&block)
59
- platform=case RUBY_PLATFORM
60
- when /linux/
61
- :on_linux
62
- when /win/
63
- :on_windows
64
- end
65
- @command=case command_as_hash_or_string
66
- when String
67
- command_as_hash_or_string
68
- when Hash
69
- #dup must be used, otherwise @command gets frozen. No idea why though....
70
- command_as_hash_or_string.invert[platform].dup
71
- else
72
- block || raise("No command defined for this filter: #{description}")
73
- end
74
- @command<<' 2>/dev/null' if (@command.is_a?(String) && platform==:on_linux && !@command.include?('|'))
75
- end
76
- end
77
- end
@@ -1,30 +0,0 @@
1
- require File.dirname(__FILE__) + '/../spec_helper'
2
-
3
- describe "Filters" do
4
- before(:all) do
5
- Finder.ensure_that_index_exists_on_disk
6
- end
7
-
8
- PlainText.filters.each{|filter|
9
- filter.exts.each{|ext|
10
- should_extract= "should be able to extract content from #{filter.description} (.#{ext})"
11
- content_and_file_examples_for_this_ext=filter.content_and_file_examples.select{|content,file| File.ext_as_sym(file)==ext}
12
- unless content_and_file_examples_for_this_ext.empty? then
13
- it should_extract do
14
- content_and_file_examples_for_this_ext.each{|content_example,file_example|
15
- finder=Finder.new(content_example)
16
- finder.execute!
17
- matching_documents=finder.matching_documents
18
- matching_documents_filenames=matching_documents.collect{|d| d.filename}
19
- matching_documents_filenames.should include(file_example)
20
- }
21
- end
22
- else
23
- ## It means that the spec for this extension file is "Not yet implemented"!
24
- ## add this line to the corresponding filter in lib/filters:
25
- # which_should_for_example_extract 'some content', :from => 'a file you could add in spec/test_dirs/indexed/'
26
- it should_extract
27
- end
28
- }
29
- }
30
- end