picolena 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +8 -0
- data/Manifest.txt +28 -15
- data/README.txt +1 -1
- data/config/files_to_clean +2 -1
- data/config/hoe.rb +1 -1
- data/lib/picolena/config/basic.rb +46 -35
- data/lib/picolena/config/icons_and_filetypes.yml +69 -0
- data/lib/picolena/config/indexed_directories.yml +1 -1
- data/lib/picolena/picolena_generator.rb +3 -1
- data/lib/picolena/templates/app/controllers/application.rb +2 -2
- data/lib/picolena/templates/app/controllers/documents_controller.rb +1 -1
- data/lib/picolena/templates/app/helpers/documents_helper.rb +7 -26
- data/lib/picolena/templates/app/models/document.rb +32 -14
- data/lib/picolena/templates/app/models/finder.rb +21 -78
- data/lib/picolena/templates/app/models/index_reader.rb +56 -0
- data/lib/picolena/templates/app/models/index_writer.rb +36 -0
- data/lib/picolena/templates/app/models/indexer.rb +142 -0
- data/lib/picolena/templates/app/models/plain_text_extractor.rb +122 -0
- data/lib/picolena/templates/app/models/query.rb +31 -0
- data/lib/picolena/templates/app/views/documents/_document.html.haml +2 -2
- data/lib/picolena/templates/config/environment.rb +2 -2
- data/lib/picolena/templates/config/environments/development.rb +1 -1
- data/lib/picolena/templates/config/environments/production.rb +1 -1
- data/lib/picolena/templates/config/environments/test.rb +1 -1
- data/lib/picolena/templates/config/initializers/002_load_indexed_dirs.rb +2 -0
- data/lib/picolena/templates/config/initializers/003_load_white_list_IPs.rb +3 -1
- data/lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb +6 -0
- data/lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb +2 -0
- data/lib/picolena/templates/config/initializers/006_load_icons.rb +8 -0
- data/lib/picolena/templates/lib/core_exts.rb +20 -1
- data/lib/picolena/templates/lib/plain_text_extractor_DSL.rb +72 -0
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/adobe.pdf.rb +3 -3
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/html.rb +2 -2
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.excel.rb +4 -4
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.powerpoint.rb +4 -4
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.rtf.rb +2 -2
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.word.rb +4 -4
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.presentation.rb +2 -2
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.spreadsheet.rb +2 -2
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.text.rb +2 -2
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/plain_text.rb +3 -3
- data/lib/picolena/templates/lib/tasks/index.rake +4 -6
- data/lib/picolena/templates/lib/tasks/install_dependencies.rake +1 -1
- data/lib/picolena/templates/spec/controllers/documents_controller_spec.rb +5 -5
- data/lib/picolena/templates/spec/helpers/documents_helper_spec.rb +1 -1
- data/lib/picolena/templates/spec/models/basic_finder_spec.rb +13 -13
- data/lib/picolena/templates/spec/models/document_spec.rb +1 -1
- data/lib/picolena/templates/spec/models/finder_spec.rb +5 -70
- data/lib/picolena/templates/spec/models/host_indexing_system_spec.rb +6 -2
- data/lib/picolena/templates/spec/models/index_directories_spec.rb +4 -4
- data/lib/picolena/templates/spec/models/index_reader_spec.rb +7 -0
- data/lib/picolena/templates/spec/models/index_writer_spec.rb +7 -0
- data/lib/picolena/templates/spec/models/indexer_spec.rb +7 -0
- data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +42 -0
- data/lib/picolena/templates/spec/models/query_spec.rb +56 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/lang/goethe +42 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/lang/hugo +83 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/lang/lorca +86 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/lang/shakespeare +90 -0
- data/lib/picolena/version.rb +1 -1
- data/tasks/hack.rake +2 -1
- data/website/index.html +2 -2
- data.tar.gz.sig +0 -0
- metadata +30 -17
- metadata.gz.sig +0 -0
- data/lib/picolena/templates/config/initializers/004_load_filters.rb +0 -6
- data/lib/picolena/templates/lib/ff.rb +0 -117
- data/lib/picolena/templates/lib/filter.rb +0 -75
- data/lib/picolena/templates/lib/filter_DSL.rb +0 -77
- data/lib/picolena/templates/spec/models/filters_spec.rb +0 -30
@@ -0,0 +1,86 @@
|
|
1
|
+
Verde que te quiero verde.
|
2
|
+
Verde viento. Verdes ramas.
|
3
|
+
El barco sobre la mar
|
4
|
+
y el caballo en la montaña.
|
5
|
+
Con la sombra en la cintura
|
6
|
+
ella sueña en su baranda,
|
7
|
+
verde carne, pelo verde,
|
8
|
+
con ojos de fría plata.
|
9
|
+
Verde que te quiero verde.
|
10
|
+
Bajo la luna gitana,
|
11
|
+
las cosas la están mirando
|
12
|
+
y ella no puede mirarlas.
|
13
|
+
Verde que te quiero verde.
|
14
|
+
Grandes estrellas de escarcha
|
15
|
+
vienen con el pez de sombra
|
16
|
+
que abre el camino del alba.
|
17
|
+
La higuera frota su viento
|
18
|
+
con la lija de sus ramas,
|
19
|
+
y el monte, gato garduño,
|
20
|
+
eriza sus pitas agrias.
|
21
|
+
¿Pero quién vendra? ¿Y por dónde...?
|
22
|
+
Ella sigue en su baranda,
|
23
|
+
Verde came, pelo verde,
|
24
|
+
soñando en la mar amarga.
|
25
|
+
--Compadre, quiero cambiar
|
26
|
+
mi caballo por su casa,
|
27
|
+
mi montura por su espejo,
|
28
|
+
mi cuchillo per su manta.
|
29
|
+
Compadre, vengo sangrando,
|
30
|
+
desde los puertos de Cabra.
|
31
|
+
--Si yo pudiera, mocito,
|
32
|
+
este trato se cerraba.
|
33
|
+
Pero yo ya no soy yo,
|
34
|
+
ni mi casa es ya mi casa.
|
35
|
+
--Compadre, quiero morir
|
36
|
+
decentemente en mi cama.
|
37
|
+
De acero, si puede ser,
|
38
|
+
con las sábanas de holanda.
|
39
|
+
¿No ves la herida que tengo
|
40
|
+
desde el pecho a la garganta?
|
41
|
+
--Trescientas rosas morenas
|
42
|
+
lleva tu pechera blanca.
|
43
|
+
Tu sangre rezuma y huele
|
44
|
+
alrededor de tu faja.
|
45
|
+
Pero yo ya no soy yo,
|
46
|
+
ni mi casa es ya mi casa.
|
47
|
+
--Dejadme subir al menos
|
48
|
+
hasta las altas barandas;
|
49
|
+
¡dejadme subir!, dejadme,
|
50
|
+
hasta las verdes barandas.
|
51
|
+
Barandales de la luna
|
52
|
+
por donde retumba el agua.
|
53
|
+
Ya suben los dos compadres
|
54
|
+
hacia las altas barandas.
|
55
|
+
Dejando un rastro de sangre.
|
56
|
+
Dejando un rastro de lágrimas.
|
57
|
+
Temblaban en los tejados
|
58
|
+
farolillos de hojalata.
|
59
|
+
Mil panderos de cristal
|
60
|
+
herían la madrugada.
|
61
|
+
Verde que te quiero verde,
|
62
|
+
verde viento, verdes ramas.
|
63
|
+
Los dos compadres subieron.
|
64
|
+
El largo viento dejaba
|
65
|
+
en la boca un raro gusto
|
66
|
+
de hiel, de menta y de albahaca.
|
67
|
+
¡Compadre! ¿Donde está, díme?
|
68
|
+
¿Donde está tu niña amarga?
|
69
|
+
¡Cuántas veces te esperó!
|
70
|
+
¡Cuántas veces te esperara,
|
71
|
+
cara fresca, negro pelo,
|
72
|
+
en esta verde baranda!
|
73
|
+
Sobre el rostro del aljibe
|
74
|
+
se mecía la gitana.
|
75
|
+
Verde carne, pelo verde,
|
76
|
+
con ojos de fría plata.
|
77
|
+
Un carámbano de luna
|
78
|
+
la sostiene sobre el agua.
|
79
|
+
La noche se puso íntima
|
80
|
+
como una pequeña plaza.
|
81
|
+
Guardias civiles borrachos
|
82
|
+
en la puerta golpeaban.
|
83
|
+
Verde que te qinero verde.
|
84
|
+
Verde viento. Verdes ramas.
|
85
|
+
El barco sobre la mar.
|
86
|
+
Y el caballo en la montaña.
|
@@ -0,0 +1,90 @@
|
|
1
|
+
THE PHOENIX AND THE TURTLE
|
2
|
+
A POEM BY
|
3
|
+
WILLIAM SHAKESPEARE
|
4
|
+
|
5
|
+
The Phoenix and the Turtle
|
6
|
+
Let the bird of loudest lay
|
7
|
+
On the sole Arabian tree,
|
8
|
+
Herald sad and trumpet be,
|
9
|
+
To whose sound chaste wings obey.
|
10
|
+
But thou shrieking harbinger,
|
11
|
+
Foul precurrer of the fiend,
|
12
|
+
Augur of the fever's end,
|
13
|
+
To this troop come thou not near.
|
14
|
+
|
15
|
+
From this session interdict
|
16
|
+
Every fowl of tyrant wing
|
17
|
+
Save the eagle, feather'd king:
|
18
|
+
Keep the obsequy so strict.
|
19
|
+
|
20
|
+
Let the priest in surplice white
|
21
|
+
That defunctive music can,
|
22
|
+
Be the death-divining swan,
|
23
|
+
Lest the requiem lack his right.
|
24
|
+
|
25
|
+
And thou, treble-dated crow,
|
26
|
+
That thy sable gender mak'st
|
27
|
+
With the breath thou giv'st and tak'st,
|
28
|
+
'Mongst our mourners shalt thou go.
|
29
|
+
|
30
|
+
Here the anthem doth commence:—
|
31
|
+
Love and constancy is dead;
|
32
|
+
Phoenix and the turtle fled
|
33
|
+
In a mutual flame from hence.
|
34
|
+
|
35
|
+
So they loved, as love in twain
|
36
|
+
Had the essence but in one;
|
37
|
+
Two distincts, division none;
|
38
|
+
Number there in love was slain.
|
39
|
+
|
40
|
+
Hearts remote, yet not asunder;
|
41
|
+
Distance, and no space was seen
|
42
|
+
'Twixt the turtle and his queen:
|
43
|
+
But in them it were a wonder.
|
44
|
+
|
45
|
+
So between them love did shine,
|
46
|
+
That the turtle saw his right
|
47
|
+
Flaming in the phoenix' sight;
|
48
|
+
Either was the other's mine.
|
49
|
+
|
50
|
+
Property was thus appall'd,
|
51
|
+
That the self was not the same;
|
52
|
+
Single nature's double name
|
53
|
+
Neither two nor one was call'd.
|
54
|
+
|
55
|
+
Reason, in itself confounded,
|
56
|
+
Saw division grow together;
|
57
|
+
To themselves yet either neither;
|
58
|
+
Simple were so well compounded,
|
59
|
+
|
60
|
+
That it cried, 'How true a twain
|
61
|
+
Seemeth this concordant one!
|
62
|
+
Love hath reason, reason none
|
63
|
+
If what parts can so remain.'
|
64
|
+
|
65
|
+
Whereupon it made this threne
|
66
|
+
To the phoenix and the dove,
|
67
|
+
Co-supremes and stars of love,
|
68
|
+
As chorus to their tragic scene.
|
69
|
+
|
70
|
+
THRENOS
|
71
|
+
|
72
|
+
BEAUTY, truth, and rarity,
|
73
|
+
Grace in all simplicity,
|
74
|
+
Here enclosed in cinders lie.
|
75
|
+
|
76
|
+
Death is now the phoenix' nest;
|
77
|
+
And the turtle's loyal breast
|
78
|
+
To eternity doth rest,
|
79
|
+
|
80
|
+
Leaving no posterity:
|
81
|
+
'Twas not their infirmity,
|
82
|
+
It was married chastity.
|
83
|
+
|
84
|
+
Truth may seem, but cannot be;
|
85
|
+
Beauty brag, but 'tis not she;
|
86
|
+
Truth and beauty buried be.
|
87
|
+
|
88
|
+
To this urn let those repair
|
89
|
+
That are either true or fair;
|
90
|
+
For these dead birds sigh a prayer.
|
data/lib/picolena/version.rb
CHANGED
data/tasks/hack.rake
CHANGED
@@ -2,7 +2,8 @@ desc 'Create development picolena structure inside lib/picolena/templates'
|
|
2
2
|
task :lets_hack do
|
3
3
|
picolena_root=File.join(File.dirname(__FILE__),'..')
|
4
4
|
Dir.chdir(picolena_root){
|
5
|
-
|
5
|
+
# Doesn't overwrite any file, Doesn't create any index, Doesn't launch any spec.
|
6
|
+
system("ruby bin/picolena lib/picolena/templates/spec/test_dirs/indexed --skip --no-index --no-spec --destination=lib/picolena/templates")
|
6
7
|
}
|
7
8
|
puts <<-EXPLAIN
|
8
9
|
|
data/website/index.html
CHANGED
@@ -33,7 +33,7 @@
|
|
33
33
|
<h1>Picolena</h1>
|
34
34
|
<div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/picolena"; return false'>
|
35
35
|
<p>Get Version</p>
|
36
|
-
<a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.1</a>
|
36
|
+
<a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.2</a>
|
37
37
|
</div>
|
38
38
|
<h1>→ ‘picolena’</h1>
|
39
39
|
|
@@ -114,7 +114,7 @@ ruby script/server</code></pre>
|
|
114
114
|
|
115
115
|
<p>Comments are welcome. Send an email to <a href="mailto:eric_duminil@rubyforge.org">Eric Duminil</a> email via the <a href="http://groups.google.com/group/picolena">forum</a></p>
|
116
116
|
<p class="coda">
|
117
|
-
<a href="eric_duminil@rubyforge.org">Eric DUMINIL</a>,
|
117
|
+
<a href="eric_duminil@rubyforge.org">Eric DUMINIL</a>, 20th April 2008<br>
|
118
118
|
Theme extended from <a href="http://rb2js.rubyforge.org/">Paul Battley</a>,
|
119
119
|
by Daniel Cadenas via <a href="http://depgraph.rubyforge.org/">DepGraph</a>
|
120
120
|
</p>
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: picolena
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eric Duminil
|
@@ -30,7 +30,7 @@ cert_chain:
|
|
30
30
|
qvI9FgPZ1QTG5uZAlBbk6d6JU2XfpA==
|
31
31
|
-----END CERTIFICATE-----
|
32
32
|
|
33
|
-
date: 2008-04-
|
33
|
+
date: 2008-04-20 00:00:00 +02:00
|
34
34
|
default_executable:
|
35
35
|
dependencies:
|
36
36
|
- !ruby/object:Gem::Dependency
|
@@ -143,6 +143,7 @@ files:
|
|
143
143
|
- config/requirements.rb
|
144
144
|
- lib/picolena/USAGE
|
145
145
|
- lib/picolena/config/basic.rb
|
146
|
+
- lib/picolena/config/icons_and_filetypes.yml
|
146
147
|
- lib/picolena/config/indexed_directories.yml
|
147
148
|
- lib/picolena/config/title_and_names_and_links.yml
|
148
149
|
- lib/picolena/config/white_list_ip.yml
|
@@ -155,6 +156,11 @@ files:
|
|
155
156
|
- lib/picolena/templates/app/helpers/documents_helper.rb
|
156
157
|
- lib/picolena/templates/app/models/document.rb
|
157
158
|
- lib/picolena/templates/app/models/finder.rb
|
159
|
+
- lib/picolena/templates/app/models/index_reader.rb
|
160
|
+
- lib/picolena/templates/app/models/index_writer.rb
|
161
|
+
- lib/picolena/templates/app/models/indexer.rb
|
162
|
+
- lib/picolena/templates/app/models/plain_text_extractor.rb
|
163
|
+
- lib/picolena/templates/app/models/query.rb
|
158
164
|
- lib/picolena/templates/app/views/documents/_document.html.haml
|
159
165
|
- lib/picolena/templates/app/views/documents/cached.html.haml
|
160
166
|
- lib/picolena/templates/app/views/documents/content.html.haml
|
@@ -169,27 +175,26 @@ files:
|
|
169
175
|
- lib/picolena/templates/config/initializers/001_load_custom_config.rb
|
170
176
|
- lib/picolena/templates/config/initializers/002_load_indexed_dirs.rb
|
171
177
|
- lib/picolena/templates/config/initializers/003_load_white_list_IPs.rb
|
172
|
-
- lib/picolena/templates/config/initializers/004_load_filters.rb
|
178
|
+
- lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb
|
173
179
|
- lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb
|
180
|
+
- lib/picolena/templates/config/initializers/006_load_icons.rb
|
174
181
|
- lib/picolena/templates/config/routes.rb
|
175
182
|
- lib/picolena/templates/lang/ui/de.yml
|
176
183
|
- lib/picolena/templates/lang/ui/en.yml
|
177
184
|
- lib/picolena/templates/lang/ui/es.yml
|
178
185
|
- lib/picolena/templates/lang/ui/fr.yml
|
179
186
|
- lib/picolena/templates/lib/core_exts.rb
|
180
|
-
- lib/picolena/templates/lib/ff.rb
|
181
|
-
- lib/picolena/templates/lib/filter.rb
|
182
|
-
- lib/picolena/templates/lib/filter_DSL.rb
|
183
|
-
- lib/picolena/templates/lib/filters/adobe.pdf.rb
|
184
|
-
- lib/picolena/templates/lib/filters/html.rb
|
185
|
-
- lib/picolena/templates/lib/filters/ms.excel.rb
|
186
|
-
- lib/picolena/templates/lib/filters/ms.powerpoint.rb
|
187
|
-
- lib/picolena/templates/lib/filters/ms.rtf.rb
|
188
|
-
- lib/picolena/templates/lib/filters/ms.word.rb
|
189
|
-
- lib/picolena/templates/lib/filters/opendocument.presentation.rb
|
190
|
-
- lib/picolena/templates/lib/filters/opendocument.spreadsheet.rb
|
191
|
-
- lib/picolena/templates/lib/filters/opendocument.text.rb
|
192
|
-
- lib/picolena/templates/lib/filters/plain_text.rb
|
187
|
+
- lib/picolena/templates/lib/plain_text_extractor_DSL.rb
|
188
|
+
- lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb
|
189
|
+
- lib/picolena/templates/lib/plain_text_extractors/html.rb
|
190
|
+
- lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb
|
191
|
+
- lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb
|
192
|
+
- lib/picolena/templates/lib/plain_text_extractors/ms.rtf.rb
|
193
|
+
- lib/picolena/templates/lib/plain_text_extractors/ms.word.rb
|
194
|
+
- lib/picolena/templates/lib/plain_text_extractors/opendocument.presentation.rb
|
195
|
+
- lib/picolena/templates/lib/plain_text_extractors/opendocument.spreadsheet.rb
|
196
|
+
- lib/picolena/templates/lib/plain_text_extractors/opendocument.text.rb
|
197
|
+
- lib/picolena/templates/lib/plain_text_extractors/plain_text.rb
|
193
198
|
- lib/picolena/templates/lib/tasks/annotations.rake
|
194
199
|
- lib/picolena/templates/lib/tasks/index.rake
|
195
200
|
- lib/picolena/templates/lib/tasks/install_dependencies.rake
|
@@ -246,10 +251,14 @@ files:
|
|
246
251
|
- lib/picolena/templates/spec/helpers/documents_helper_spec.rb
|
247
252
|
- lib/picolena/templates/spec/models/basic_finder_spec.rb
|
248
253
|
- lib/picolena/templates/spec/models/document_spec.rb
|
249
|
-
- lib/picolena/templates/spec/models/filters_spec.rb
|
250
254
|
- lib/picolena/templates/spec/models/finder_spec.rb
|
251
255
|
- lib/picolena/templates/spec/models/host_indexing_system_spec.rb
|
252
256
|
- lib/picolena/templates/spec/models/index_directories_spec.rb
|
257
|
+
- lib/picolena/templates/spec/models/index_reader_spec.rb
|
258
|
+
- lib/picolena/templates/spec/models/index_writer_spec.rb
|
259
|
+
- lib/picolena/templates/spec/models/indexer_spec.rb
|
260
|
+
- lib/picolena/templates/spec/models/plain_text_extractor_spec.rb
|
261
|
+
- lib/picolena/templates/spec/models/query_spec.rb
|
253
262
|
- lib/picolena/templates/spec/rcov.opts
|
254
263
|
- lib/picolena/templates/spec/spec.opts
|
255
264
|
- lib/picolena/templates/spec/spec_helper.rb
|
@@ -274,6 +283,10 @@ files:
|
|
274
283
|
- lib/picolena/templates/spec/test_dirs/indexed/different_encodings/iso-8859-15.txt
|
275
284
|
- lib/picolena/templates/spec/test_dirs/indexed/different_encodings/utf-8.txt
|
276
285
|
- lib/picolena/templates/spec/test_dirs/indexed/just_one_doc/for_test.txt
|
286
|
+
- lib/picolena/templates/spec/test_dirs/indexed/lang/goethe
|
287
|
+
- lib/picolena/templates/spec/test_dirs/indexed/lang/hugo
|
288
|
+
- lib/picolena/templates/spec/test_dirs/indexed/lang/lorca
|
289
|
+
- lib/picolena/templates/spec/test_dirs/indexed/lang/shakespeare
|
277
290
|
- lib/picolena/templates/spec/test_dirs/indexed/literature/Simulation of district heating systems for evaluation of real-time control strategies.pdf
|
278
291
|
- lib/picolena/templates/spec/test_dirs/indexed/literature/Types of malfunction in DH substations.doc
|
279
292
|
- lib/picolena/templates/spec/test_dirs/indexed/others/'weird'filename.txt
|
metadata.gz.sig
CHANGED
Binary file
|
@@ -1,117 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# ff - Search and index document files using Ferret
|
4
|
-
#
|
5
|
-
# Author: Stuart Rackham <srackham@methods.co.nz>
|
6
|
-
# License: This source code is released under the MIT license.
|
7
|
-
# Home page: http://www.methods.co.nz/ff/
|
8
|
-
#
|
9
|
-
# Requisites:
|
10
|
-
# - Ferret 0.10.4 or better installed as a Ruby Gem.
|
11
|
-
# See http://ferret.davebalmain.com/trac for Ferret installation.
|
12
|
-
# - External text file filters documented in lib/filters/*.rb.
|
13
|
-
|
14
|
-
#TODO: Not Rubyish at all. Refactor all this!
|
15
|
-
|
16
|
-
Analyzer=Ferret::Analysis::StandardAnalyzer.new
|
17
|
-
|
18
|
-
# Add file +filename+ to the +index+.
|
19
|
-
def index_file(index, filename, mime_type=nil)
|
20
|
-
complete_path=File.expand_path(filename)
|
21
|
-
fields = {
|
22
|
-
:complete_path=> complete_path,
|
23
|
-
:probably_unique_id => complete_path.base26_hash,
|
24
|
-
:file => File.basename(filename),
|
25
|
-
:basename => File.basename(filename, File.extname(filename)).gsub(/_/,' '),
|
26
|
-
:filetype => File.extname(filename),
|
27
|
-
:date => File.mtime(filename).strftime("%Y%m%d%H%M")
|
28
|
-
}
|
29
|
-
|
30
|
-
if mime_type then
|
31
|
-
text = PlainText.extract_content_from(filename)
|
32
|
-
raise "empty document #{filename}" if text.strip.empty?
|
33
|
-
fields[:content] = text
|
34
|
-
end
|
35
|
-
|
36
|
-
index << fields
|
37
|
-
end
|
38
|
-
|
39
|
-
def index_file_and_increment_counter(index,filename,mime_type,counters)
|
40
|
-
counters[mime_type] ||= Struct::Counter.new(0,0,0,0)
|
41
|
-
counters[mime_type].count += 1
|
42
|
-
counters[mime_type].size += File.size(filename)
|
43
|
-
start=Time.now
|
44
|
-
index_file(index, filename,mime_type)
|
45
|
-
counters[mime_type].time_needed += Time.now-start
|
46
|
-
end
|
47
|
-
|
48
|
-
# Recursively add all qualifying files in directory +dir+ to +index+.
|
49
|
-
def index_directory(index, dir, counters)
|
50
|
-
#Index just everything!
|
51
|
-
Dir.glob(File.join(dir,"**/*")) do |filename|
|
52
|
-
# Skip Thumbs.db files
|
53
|
-
if File.file?(filename) and not filename =~ /(Thumbs\.db)/
|
54
|
-
begin
|
55
|
-
IndexLogger.debug "indexing: #{filename}"
|
56
|
-
|
57
|
-
# Trying to guess MIME type from file contents is not reliable for text
|
58
|
-
# files. The strategy used here is to infer from file name extension
|
59
|
-
# and rely on the convertor routine to fail if type is incorrect.
|
60
|
-
mime_type = File.mime(filename)
|
61
|
-
index_file_and_increment_counter(index,filename,mime_type,counters)
|
62
|
-
rescue => e
|
63
|
-
# if mime is unknown, just index filename, basename and extension
|
64
|
-
IndexLogger.debug "indexing without content: #{e.message}"
|
65
|
-
index_file(index, filename)
|
66
|
-
counters[mime_type||'Unknown mime type'].without_content += 1
|
67
|
-
end
|
68
|
-
end
|
69
|
-
end
|
70
|
-
end
|
71
|
-
|
72
|
-
def create_index(dirs)
|
73
|
-
FileUtils.mkpath File.dirname(IndexSavePath)
|
74
|
-
index = Ferret::Index::IndexWriter.new(:create => true, :path => IndexSavePath, :analyzer => Analyzer)
|
75
|
-
|
76
|
-
add_fields(index)
|
77
|
-
|
78
|
-
Struct.new('Counter', :size, :count, :without_content, :time_needed) unless Struct.constants.include?("Counter")
|
79
|
-
counters = {}
|
80
|
-
begin
|
81
|
-
dirs.each { |dir| index_directory(index, dir, counters) }
|
82
|
-
index.optimize
|
83
|
-
ensure
|
84
|
-
index.close
|
85
|
-
end
|
86
|
-
counters.each_pair do |key,value|
|
87
|
-
IndexLogger.info "\n#{key}:"
|
88
|
-
IndexLogger.info "files indexed: #{value.count} (#{value.size} bytes)"
|
89
|
-
IndexLogger.info("files without_content: #{value.without_content}") unless value.without_content.zero?
|
90
|
-
unless value.count.zero? or value.without_content==value.count then
|
91
|
-
IndexLogger.info "time needed: #{(value.time_needed*1000).to_i} ms"
|
92
|
-
IndexLogger.info "avg. time needed: #{(value.time_needed*1000/(value.count-value.without_content)).to_i} ms/file"
|
93
|
-
end
|
94
|
-
end
|
95
|
-
total_count = counters.values.inject(0) {|sum,count| sum + count.count}
|
96
|
-
total_size = counters.values.inject(0) {|sum,count| sum + count.size}
|
97
|
-
total_without_content = counters.values.inject(0) {|sum,count| sum + count.without_content}
|
98
|
-
total_time_needed = counters.values.inject(0) {|sum,count| sum + count.time_needed}
|
99
|
-
IndexLogger.info "\ntotal files indexed: #{total_count} (#{total_size} bytes)"
|
100
|
-
IndexLogger.info("total files without_content: #{total_without_content}") unless total_without_content.zero?
|
101
|
-
unless total_count.zero? or total_count==total_without_content then
|
102
|
-
IndexLogger.info "total time needed: #{(total_time_needed*1000).to_i} ms"
|
103
|
-
IndexLogger.info "avg. time needed: #{(total_time_needed*1000/(total_count-total_without_content)).to_i} ms/file"
|
104
|
-
end
|
105
|
-
end
|
106
|
-
|
107
|
-
def add_fields(index)
|
108
|
-
# Although not intuitively obvious, until I (Stuart Rackham) tokenized the file name, wildcard
|
109
|
-
# file name searches did not return all matching documents.
|
110
|
-
index.field_infos.add_field(:complete_path, :store => :yes, :index => :yes)
|
111
|
-
index.field_infos.add_field(:content, :store => :yes, :index => :yes)
|
112
|
-
index.field_infos.add_field(:basename, :store => :no, :index => :yes, :boost => 1.5)
|
113
|
-
index.field_infos.add_field(:file, :store => :no, :index => :yes, :boost => 1.5)
|
114
|
-
index.field_infos.add_field(:filetype, :store => :no, :index => :yes, :boost => 1.5)
|
115
|
-
index.field_infos.add_field(:date, :store=>:yes, :index=>:yes)
|
116
|
-
index.field_infos.add_field(:probably_unique_id, :store=>:no, :index=>:yes)
|
117
|
-
end
|
@@ -1,75 +0,0 @@
|
|
1
|
-
require 'filter_DSL'
|
2
|
-
|
3
|
-
module PlainText
|
4
|
-
@@filters=[]
|
5
|
-
|
6
|
-
#returns every defined filter
|
7
|
-
def self.filters
|
8
|
-
@@filters
|
9
|
-
end
|
10
|
-
|
11
|
-
#returns every required dependency for every defined filter
|
12
|
-
def self.filter_dependencies
|
13
|
-
@@dependencies||=filters.collect{|filter| filter.dependencies}.flatten.compact.uniq.sort
|
14
|
-
end
|
15
|
-
|
16
|
-
#returns every supported file extensions
|
17
|
-
def self.supported_extensions
|
18
|
-
@@supported_exts||=filters.collect{|filter| filter.exts}.flatten.compact.uniq
|
19
|
-
end
|
20
|
-
|
21
|
-
#finds which filter should be used for a given file, according to its extension
|
22
|
-
def self.find_filter_for(filename)
|
23
|
-
ext=File.ext_as_sym(filename)
|
24
|
-
filter=filters.find{|filter| filter.exts.include?(ext)} || raise(ArgumentError, "no convertor for #{filename}")
|
25
|
-
filter.source=filename
|
26
|
-
filter
|
27
|
-
end
|
28
|
-
|
29
|
-
#launches filter on given file and outputs plain text result
|
30
|
-
def self.extract_content_from(source)
|
31
|
-
find_filter_for(source).extract_content
|
32
|
-
end
|
33
|
-
|
34
|
-
|
35
|
-
class Filter
|
36
|
-
attr_accessor :source
|
37
|
-
|
38
|
-
#parses command in order to know which programs are needed.
|
39
|
-
#rspec will then check that every dependecy is installed on the system
|
40
|
-
def dependencies
|
41
|
-
if command.is_a?(String) then
|
42
|
-
command.split(/\|\s*/).collect{|command_part| command_part.split(/ /).first}
|
43
|
-
else
|
44
|
-
@dependencies
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
#Conversion part
|
49
|
-
|
50
|
-
#destination method can be used by some conversion command that cannot output to stdout (example?)
|
51
|
-
#a file containing plain text result will first be written by command, and then be read by extract_content.
|
52
|
-
def destination
|
53
|
-
require 'tmpdir'
|
54
|
-
@@temp_file_as_destination ||= File.join(Dir::tmpdir,"ferret_#{Time.now.to_i}")
|
55
|
-
end
|
56
|
-
|
57
|
-
#Replaces generic command with specific source and destination (if specified) files
|
58
|
-
def specific_command
|
59
|
-
command.sub('SOURCE','"'<<source<<'"').sub('DESTINATION','"'<<destination<<'"')
|
60
|
-
end
|
61
|
-
|
62
|
-
def extract_content
|
63
|
-
if command.is_a?(String) then
|
64
|
-
if command.include?('DESTINATION') then
|
65
|
-
system(specific_command)
|
66
|
-
File.read_and_remove(destination)
|
67
|
-
else
|
68
|
-
IO.popen(specific_command){|io| io.read}
|
69
|
-
end
|
70
|
-
else
|
71
|
-
command.call(source)
|
72
|
-
end
|
73
|
-
end
|
74
|
-
end
|
75
|
-
end
|
@@ -1,77 +0,0 @@
|
|
1
|
-
#Module used to define Filters with DSL
|
2
|
-
#For example, to convert "Microsoft Office Word document" to plain text
|
3
|
-
# PlainText.extract {
|
4
|
-
# from :doc, :dot
|
5
|
-
# as "application/msword"
|
6
|
-
# aka "Microsoft Office Word document"
|
7
|
-
# with "antiword SOURCE > DESTINATION 2>/dev/null" => :on_linux, "some other command" => :on_windows
|
8
|
-
# which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc'
|
9
|
-
# }
|
10
|
-
module PlainText
|
11
|
-
#defines a new Filter with DSL
|
12
|
-
def self.extract(&block)
|
13
|
-
filter = Filter.new
|
14
|
-
filter.instance_eval(&block)
|
15
|
-
@@filters<<filter
|
16
|
-
MimeType.add(filter.exts,filter.mime_name)
|
17
|
-
end
|
18
|
-
|
19
|
-
#defined by DSL described in PlainText
|
20
|
-
class Filter
|
21
|
-
attr_reader :exts, :mime_name, :description, :command, :content_and_file_examples
|
22
|
-
|
23
|
-
def initialize
|
24
|
-
@content_and_file_examples=[]
|
25
|
-
end
|
26
|
-
|
27
|
-
def from(*exts)
|
28
|
-
@exts=exts
|
29
|
-
end
|
30
|
-
|
31
|
-
def as(mime_name)
|
32
|
-
@mime_name=mime_name
|
33
|
-
end
|
34
|
-
|
35
|
-
def aka(description)
|
36
|
-
@description=description
|
37
|
-
end
|
38
|
-
|
39
|
-
def which_requires(*dependencies)
|
40
|
-
@dependencies=dependencies
|
41
|
-
end
|
42
|
-
|
43
|
-
#used by rspec to test filters:
|
44
|
-
# which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
|
45
|
-
# or_extract 'some other stuff inside another pdf file', :from => 'yet_another.pdf'
|
46
|
-
#
|
47
|
-
#this spec will pass if 'basic.pdf' and 'yet_another.pdf' are included in an indexed directory, if every dependency is installed,
|
48
|
-
#and if plain text output from the filter applied to 'basic.pdf' and 'yet_another.pdf' respectively include 'in a pdf file' and 'some other stuff inside another pdf file'
|
49
|
-
def which_should_for_example_extract(content, file)
|
50
|
-
@content_and_file_examples << [content,file[:from]]
|
51
|
-
end
|
52
|
-
|
53
|
-
#it allows to define specs in this way:
|
54
|
-
# which_should_for_example_extract 'Hello world!', :from => 'hello.rb'
|
55
|
-
# or_extract 'text inside!', :from => 'crossed.txt'
|
56
|
-
alias_method :or_extract, :which_should_for_example_extract
|
57
|
-
|
58
|
-
def with(command_as_hash_or_string=nil,&block)
|
59
|
-
platform=case RUBY_PLATFORM
|
60
|
-
when /linux/
|
61
|
-
:on_linux
|
62
|
-
when /win/
|
63
|
-
:on_windows
|
64
|
-
end
|
65
|
-
@command=case command_as_hash_or_string
|
66
|
-
when String
|
67
|
-
command_as_hash_or_string
|
68
|
-
when Hash
|
69
|
-
#dup must be used, otherwise @command gets frozen. No idea why though....
|
70
|
-
command_as_hash_or_string.invert[platform].dup
|
71
|
-
else
|
72
|
-
block || raise("No command defined for this filter: #{description}")
|
73
|
-
end
|
74
|
-
@command<<' 2>/dev/null' if (@command.is_a?(String) && platform==:on_linux && !@command.include?('|'))
|
75
|
-
end
|
76
|
-
end
|
77
|
-
end
|
@@ -1,30 +0,0 @@
|
|
1
|
-
require File.dirname(__FILE__) + '/../spec_helper'
|
2
|
-
|
3
|
-
describe "Filters" do
|
4
|
-
before(:all) do
|
5
|
-
Finder.ensure_that_index_exists_on_disk
|
6
|
-
end
|
7
|
-
|
8
|
-
PlainText.filters.each{|filter|
|
9
|
-
filter.exts.each{|ext|
|
10
|
-
should_extract= "should be able to extract content from #{filter.description} (.#{ext})"
|
11
|
-
content_and_file_examples_for_this_ext=filter.content_and_file_examples.select{|content,file| File.ext_as_sym(file)==ext}
|
12
|
-
unless content_and_file_examples_for_this_ext.empty? then
|
13
|
-
it should_extract do
|
14
|
-
content_and_file_examples_for_this_ext.each{|content_example,file_example|
|
15
|
-
finder=Finder.new(content_example)
|
16
|
-
finder.execute!
|
17
|
-
matching_documents=finder.matching_documents
|
18
|
-
matching_documents_filenames=matching_documents.collect{|d| d.filename}
|
19
|
-
matching_documents_filenames.should include(file_example)
|
20
|
-
}
|
21
|
-
end
|
22
|
-
else
|
23
|
-
## It means that the spec for this extension file is "Not yet implemented"!
|
24
|
-
## add this line to the corresponding filter in lib/filters:
|
25
|
-
# which_should_for_example_extract 'some content', :from => 'a file you could add in spec/test_dirs/indexed/'
|
26
|
-
it should_extract
|
27
|
-
end
|
28
|
-
}
|
29
|
-
}
|
30
|
-
end
|