code_zauker 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +3 -3
- data/bin/czindexer +0 -0
- data/bin/czsearch +22 -3
- data/code_zauker.gemspec +3 -1
- data/doc/CodeZauker.html +10 -8
- data/doc/CodeZauker/FileScanner.html +173 -171
- data/doc/CodeZauker/Util.html +184 -9
- data/doc/Grep.html +5 -3
- data/doc/_index.html +4 -4
- data/doc/frames.html +1 -1
- data/doc/index.html +4 -4
- data/doc/js/full_list.js +6 -0
- data/doc/method_list.html +17 -1
- data/doc/top-level-namespace.html +5 -3
- data/lib/code_zauker.rb +61 -25
- data/lib/code_zauker/constants.rb +3 -3
- data/lib/code_zauker/version.rb +1 -1
- data/readme.org +3 -17
- data/test/fixture/simple_test.pdf +0 -0
- data/test/test_pdf_indexing.rb +38 -0
- metadata +25 -13
- data/CHANGELOG.org +0 -15
data/doc/CodeZauker/Util.html
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
<title>
|
7
7
|
Class: CodeZauker::Util
|
8
8
|
|
9
|
-
— Code Zauker 0.0.
|
9
|
+
— Code Zauker 0.0.4 Documentation
|
10
10
|
|
11
11
|
</title>
|
12
12
|
|
@@ -108,6 +108,8 @@
|
|
108
108
|
|
109
109
|
|
110
110
|
|
111
|
+
|
112
|
+
|
111
113
|
|
112
114
|
<h2>
|
113
115
|
Instance Method Summary
|
@@ -141,6 +143,50 @@ This code try to "guess" the right encoding switching to ISO-8859-1 if
|
|
141
143
|
UTF-8 is not valid.</p>
|
142
144
|
</div></span>
|
143
145
|
|
146
|
+
</li>
|
147
|
+
|
148
|
+
|
149
|
+
<li class="public ">
|
150
|
+
<span class="summary_signature">
|
151
|
+
|
152
|
+
<a href="#get_lines-instance_method" title="#get_lines (instance method)">- (Object) <strong>get_lines</strong>(filename) </a>
|
153
|
+
|
154
|
+
|
155
|
+
|
156
|
+
</span>
|
157
|
+
|
158
|
+
|
159
|
+
|
160
|
+
|
161
|
+
|
162
|
+
|
163
|
+
|
164
|
+
|
165
|
+
<span class="summary_desc"><div class='inline'>
|
166
|
+
<p>Obtain lines from a filename It works even with pdf files.</p>
|
167
|
+
</div></span>
|
168
|
+
|
169
|
+
</li>
|
170
|
+
|
171
|
+
|
172
|
+
<li class="public ">
|
173
|
+
<span class="summary_signature">
|
174
|
+
|
175
|
+
<a href="#is_pdf%3F-instance_method" title="#is_pdf? (instance method)">- (Boolean) <strong>is_pdf?</strong>(filename) </a>
|
176
|
+
|
177
|
+
|
178
|
+
|
179
|
+
</span>
|
180
|
+
|
181
|
+
|
182
|
+
|
183
|
+
|
184
|
+
|
185
|
+
|
186
|
+
|
187
|
+
|
188
|
+
<span class="summary_desc"><div class='inline'></div></span>
|
189
|
+
|
144
190
|
</li>
|
145
191
|
|
146
192
|
|
@@ -207,7 +253,6 @@ interpreted as a UTF-8 whereas it is a ISO-8859 windows code.</p>
|
|
207
253
|
<pre class="lines">
|
208
254
|
|
209
255
|
|
210
|
-
56
|
211
256
|
57
|
212
257
|
58
|
213
258
|
59
|
@@ -224,10 +269,11 @@ interpreted as a UTF-8 whereas it is a ISO-8859 windows code.</p>
|
|
224
269
|
70
|
225
270
|
71
|
226
271
|
72
|
227
|
-
73
|
272
|
+
73
|
273
|
+
74</pre>
|
228
274
|
</td>
|
229
275
|
<td>
|
230
|
-
<pre class="code"><span class="info file"># File 'lib/code_zauker.rb', line
|
276
|
+
<pre class="code"><span class="info file"># File 'lib/code_zauker.rb', line 57</span>
|
231
277
|
|
232
278
|
<span class='kw'>def</span> <span class='id identifier rubyid_ensureUTF8'>ensureUTF8</span><span class='lparen'>(</span><span class='id identifier rubyid_untrusted_string'>untrusted_string</span><span class='rparen'>)</span>
|
233
279
|
<span class='kw'>if</span> <span class='id identifier rubyid_untrusted_string'>untrusted_string</span><span class='period'>.</span><span class='id identifier rubyid_valid_encoding?'>valid_encoding?</span><span class='lparen'>(</span><span class='rparen'>)</span><span class='op'>==</span><span class='kw'>false</span>
|
@@ -250,6 +296,135 @@ interpreted as a UTF-8 whereas it is a ISO-8859 windows code.</p>
|
|
250
296
|
</td>
|
251
297
|
</tr>
|
252
298
|
</table>
|
299
|
+
</div>
|
300
|
+
|
301
|
+
<div class="method_details ">
|
302
|
+
<p class="signature " id="get_lines-instance_method">
|
303
|
+
|
304
|
+
- (<tt>Object</tt>) <strong>get_lines</strong>(filename)
|
305
|
+
|
306
|
+
|
307
|
+
|
308
|
+
</p><div class="docstring">
|
309
|
+
<div class="discussion">
|
310
|
+
|
311
|
+
<p>Obtain lines from a filename It works even with pdf files</p>
|
312
|
+
|
313
|
+
|
314
|
+
</div>
|
315
|
+
</div>
|
316
|
+
<div class="tags">
|
317
|
+
|
318
|
+
|
319
|
+
</div><table class="source_code">
|
320
|
+
<tr>
|
321
|
+
<td>
|
322
|
+
<pre class="lines">
|
323
|
+
|
324
|
+
|
325
|
+
82
|
326
|
+
83
|
327
|
+
84
|
328
|
+
85
|
329
|
+
86
|
330
|
+
87
|
331
|
+
88
|
332
|
+
89
|
333
|
+
90
|
334
|
+
91
|
335
|
+
92
|
336
|
+
93
|
337
|
+
94
|
338
|
+
95
|
339
|
+
96
|
340
|
+
97
|
341
|
+
98
|
342
|
+
99
|
343
|
+
100
|
344
|
+
101
|
345
|
+
102
|
346
|
+
103
|
347
|
+
104</pre>
|
348
|
+
</td>
|
349
|
+
<td>
|
350
|
+
<pre class="code"><span class="info file"># File 'lib/code_zauker.rb', line 82</span>
|
351
|
+
|
352
|
+
<span class='kw'>def</span> <span class='id identifier rubyid_get_lines'>get_lines</span><span class='lparen'>(</span><span class='id identifier rubyid_filename'>filename</span><span class='rparen'>)</span>
|
353
|
+
<span class='id identifier rubyid_lines'>lines</span><span class='op'>=</span><span class='lbracket'>[</span><span class='rbracket'>]</span>
|
354
|
+
<span class='kw'>if</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_is_pdf?'>is_pdf?</span><span class='lparen'>(</span><span class='id identifier rubyid_filename'>filename</span><span class='rparen'>)</span>
|
355
|
+
<span class='comment'># => enable pdf processing....
|
356
|
+
</span> <span class='comment'>#puts "PDF..."
|
357
|
+
</span> <span class='const'>File</span><span class='period'>.</span><span class='id identifier rubyid_open'>open</span><span class='lparen'>(</span><span class='id identifier rubyid_filename'>filename</span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>rb</span><span class='tstring_end'>"</span></span><span class='rparen'>)</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_io'>io</span><span class='op'>|</span>
|
358
|
+
<span class='id identifier rubyid_reader'>reader</span> <span class='op'>=</span> <span class='const'>PDF</span><span class='op'>::</span><span class='const'>Reader</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_io'>io</span><span class='rparen'>)</span>
|
359
|
+
<span class='comment'>#puts "PDF Scanning...#{reader.info}"
|
360
|
+
</span> <span class='id identifier rubyid_reader'>reader</span><span class='period'>.</span><span class='id identifier rubyid_pages'>pages</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_page'>page</span><span class='op'>|</span>
|
361
|
+
<span class='id identifier rubyid_linesToTrim'>linesToTrim</span><span class='op'>=</span><span class='id identifier rubyid_page'>page</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span><span class='period'>.</span><span class='id identifier rubyid_split'>split</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>\n</span><span class='tstring_end'>"</span></span><span class='rparen'>)</span>
|
362
|
+
<span class='id identifier rubyid_linesToTrim'>linesToTrim</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_l'>l</span><span class='op'>|</span>
|
363
|
+
<span class='id identifier rubyid_lines'>lines</span><span class='period'>.</span><span class='id identifier rubyid_push'>push</span><span class='lparen'>(</span><span class='id identifier rubyid_l'>l</span><span class='period'>.</span><span class='id identifier rubyid_strip'>strip</span><span class='lparen'>(</span><span class='rparen'>)</span><span class='rparen'>)</span>
|
364
|
+
<span class='kw'>end</span>
|
365
|
+
<span class='kw'>end</span>
|
366
|
+
<span class='comment'>#puts "PDF Lines:#{lines.length}"
|
367
|
+
</span> <span class='kw'>end</span>
|
368
|
+
<span class='kw'>else</span>
|
369
|
+
<span class='const'>File</span><span class='period'>.</span><span class='id identifier rubyid_open'>open</span><span class='lparen'>(</span><span class='id identifier rubyid_filename'>filename</span><span class='comma'>,</span><span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>r</span><span class='tstring_end'>"</span></span><span class='rparen'>)</span> <span class='lbrace'>{</span> <span class='op'>|</span><span class='id identifier rubyid_f'>f</span><span class='op'>|</span>
|
370
|
+
<span class='id identifier rubyid_lines'>lines</span><span class='op'>=</span><span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_readlines'>readlines</span><span class='lparen'>(</span><span class='rparen'>)</span>
|
371
|
+
<span class='rbrace'>}</span>
|
372
|
+
<span class='kw'>end</span>
|
373
|
+
<span class='kw'>return</span> <span class='id identifier rubyid_lines'>lines</span>
|
374
|
+
<span class='kw'>end</span></pre>
|
375
|
+
</td>
|
376
|
+
</tr>
|
377
|
+
</table>
|
378
|
+
</div>
|
379
|
+
|
380
|
+
<div class="method_details ">
|
381
|
+
<p class="signature " id="is_pdf?-instance_method">
|
382
|
+
|
383
|
+
- (<tt>Boolean</tt>) <strong>is_pdf?</strong>(filename)
|
384
|
+
|
385
|
+
|
386
|
+
|
387
|
+
</p><div class="docstring">
|
388
|
+
<div class="discussion">
|
389
|
+
|
390
|
+
|
391
|
+
</div>
|
392
|
+
</div>
|
393
|
+
<div class="tags">
|
394
|
+
|
395
|
+
<h3>Returns:</h3>
|
396
|
+
<ul class="return">
|
397
|
+
|
398
|
+
<li>
|
399
|
+
|
400
|
+
|
401
|
+
<span class='type'>(<tt>Boolean</tt>)</span>
|
402
|
+
|
403
|
+
|
404
|
+
|
405
|
+
</li>
|
406
|
+
|
407
|
+
</ul>
|
408
|
+
|
409
|
+
</div><table class="source_code">
|
410
|
+
<tr>
|
411
|
+
<td>
|
412
|
+
<pre class="lines">
|
413
|
+
|
414
|
+
|
415
|
+
76
|
416
|
+
77
|
417
|
+
78</pre>
|
418
|
+
</td>
|
419
|
+
<td>
|
420
|
+
<pre class="code"><span class="info file"># File 'lib/code_zauker.rb', line 76</span>
|
421
|
+
|
422
|
+
<span class='kw'>def</span> <span class='id identifier rubyid_is_pdf?'>is_pdf?</span><span class='lparen'>(</span><span class='id identifier rubyid_filename'>filename</span><span class='rparen'>)</span>
|
423
|
+
<span class='kw'>return</span> <span class='id identifier rubyid_filename'>filename</span><span class='period'>.</span><span class='id identifier rubyid_downcase'>downcase</span><span class='lparen'>(</span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_end_with?'>end_with?</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>.pdf</span><span class='tstring_end'>"</span></span><span class='rparen'>)</span>
|
424
|
+
<span class='kw'>end</span></pre>
|
425
|
+
</td>
|
426
|
+
</tr>
|
427
|
+
</table>
|
253
428
|
</div>
|
254
429
|
|
255
430
|
<div class="method_details ">
|
@@ -277,7 +452,6 @@ TODO: Very bad implementation, need improvements</p>
|
|
277
452
|
<pre class="lines">
|
278
453
|
|
279
454
|
|
280
|
-
19
|
281
455
|
20
|
282
456
|
21
|
283
457
|
22
|
@@ -306,10 +480,11 @@ TODO: Very bad implementation, need improvements</p>
|
|
306
480
|
45
|
307
481
|
46
|
308
482
|
47
|
309
|
-
48
|
483
|
+
48
|
484
|
+
49</pre>
|
310
485
|
</td>
|
311
486
|
<td>
|
312
|
-
<pre class="code"><span class="info file"># File 'lib/code_zauker.rb', line
|
487
|
+
<pre class="code"><span class="info file"># File 'lib/code_zauker.rb', line 20</span>
|
313
488
|
|
314
489
|
<span class='kw'>def</span> <span class='id identifier rubyid_mixCase'>mixCase</span><span class='lparen'>(</span><span class='id identifier rubyid_trigram'>trigram</span><span class='rparen'>)</span>
|
315
490
|
<span class='id identifier rubyid_caseMixedElements'>caseMixedElements</span><span class='op'>=</span><span class='lbracket'>[</span><span class='rbracket'>]</span>
|
@@ -351,9 +526,9 @@ TODO: Very bad implementation, need improvements</p>
|
|
351
526
|
</div>
|
352
527
|
|
353
528
|
<div id="footer">
|
354
|
-
Generated on
|
529
|
+
Generated on Sun Feb 12 19:16:26 2012 by
|
355
530
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
356
|
-
0.7.
|
531
|
+
0.7.5 (ruby-1.9.3).
|
357
532
|
</div>
|
358
533
|
|
359
534
|
</body>
|
data/doc/Grep.html
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
<title>
|
7
7
|
Module: Grep
|
8
8
|
|
9
|
-
— Code Zauker 0.0.
|
9
|
+
— Code Zauker 0.0.4 Documentation
|
10
10
|
|
11
11
|
</title>
|
12
12
|
|
@@ -82,6 +82,8 @@
|
|
82
82
|
|
83
83
|
|
84
84
|
|
85
|
+
|
86
|
+
|
85
87
|
|
86
88
|
<h2>
|
87
89
|
Instance Method Summary
|
@@ -335,9 +337,9 @@ will be printed.</p>
|
|
335
337
|
</div>
|
336
338
|
|
337
339
|
<div id="footer">
|
338
|
-
Generated on
|
340
|
+
Generated on Sun Feb 12 19:16:27 2012 by
|
339
341
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
340
|
-
0.7.
|
342
|
+
0.7.5 (ruby-1.9.3).
|
341
343
|
</div>
|
342
344
|
|
343
345
|
</body>
|
data/doc/_index.html
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
<head>
|
5
5
|
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
6
6
|
<title>
|
7
|
-
Code Zauker 0.0.
|
7
|
+
Code Zauker 0.0.4 Documentation
|
8
8
|
|
9
9
|
</title>
|
10
10
|
|
@@ -52,7 +52,7 @@
|
|
52
52
|
|
53
53
|
<iframe id="search_frame"></iframe>
|
54
54
|
|
55
|
-
<div id="content"><h1 class="noborder title">Code Zauker 0.0.
|
55
|
+
<div id="content"><h1 class="noborder title">Code Zauker 0.0.4 Documentation</h1>
|
56
56
|
<div id="listing">
|
57
57
|
<h1 class="alphaindex">Alphabetic Index</h1>
|
58
58
|
|
@@ -131,9 +131,9 @@
|
|
131
131
|
</div>
|
132
132
|
|
133
133
|
<div id="footer">
|
134
|
-
Generated on
|
134
|
+
Generated on Sun Feb 12 19:16:25 2012 by
|
135
135
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
136
|
-
0.7.
|
136
|
+
0.7.5 (ruby-1.9.3).
|
137
137
|
</div>
|
138
138
|
|
139
139
|
</body>
|
data/doc/frames.html
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
5
5
|
<head>
|
6
6
|
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
|
7
|
-
<title>Code Zauker 0.0.
|
7
|
+
<title>Code Zauker 0.0.4 Documentation</title>
|
8
8
|
</head>
|
9
9
|
<frameset cols="20%,*">
|
10
10
|
<frame name="list" src="class_list.html" />
|
data/doc/index.html
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
<head>
|
5
5
|
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
6
6
|
<title>
|
7
|
-
Code Zauker 0.0.
|
7
|
+
Code Zauker 0.0.4 Documentation
|
8
8
|
|
9
9
|
</title>
|
10
10
|
|
@@ -52,7 +52,7 @@
|
|
52
52
|
|
53
53
|
<iframe id="search_frame"></iframe>
|
54
54
|
|
55
|
-
<div id="content"><h1 class="noborder title">Code Zauker 0.0.
|
55
|
+
<div id="content"><h1 class="noborder title">Code Zauker 0.0.4 Documentation</h1>
|
56
56
|
<div id="listing">
|
57
57
|
<h1 class="alphaindex">Alphabetic Index</h1>
|
58
58
|
|
@@ -131,9 +131,9 @@
|
|
131
131
|
</div>
|
132
132
|
|
133
133
|
<div id="footer">
|
134
|
-
Generated on
|
134
|
+
Generated on Sun Feb 12 19:16:25 2012 by
|
135
135
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
136
|
-
0.7.
|
136
|
+
0.7.5 (ruby-1.9.3).
|
137
137
|
</div>
|
138
138
|
|
139
139
|
</body>
|
data/doc/js/full_list.js
CHANGED
@@ -4,6 +4,9 @@ var searchCache = [];
|
|
4
4
|
var searchString = '';
|
5
5
|
var regexSearchString = '';
|
6
6
|
var caseSensitiveMatch = false;
|
7
|
+
var ignoreKeyCodeMin = 8;
|
8
|
+
var ignoreKeyCodeMax = 46;
|
9
|
+
var commandKey = 91;
|
7
10
|
|
8
11
|
RegExp.escape = function(text) {
|
9
12
|
return text.replace(/[-[\]{}()*+?.,\\^$|#\s]/g, "\\$&");
|
@@ -19,6 +22,9 @@ function fullListSearch() {
|
|
19
22
|
});
|
20
23
|
|
21
24
|
$('#search input').keyup(function() {
|
25
|
+
if ((event.keyCode > ignoreKeyCodeMin && event.keyCode < ignoreKeyCodeMax)
|
26
|
+
|| event.keyCode == commandKey)
|
27
|
+
return;
|
22
28
|
searchString = this.value;
|
23
29
|
caseSensitiveMatch = searchString.match(/[A-Z]/) != null;
|
24
30
|
regexSearchString = RegExp.escape(searchString);
|
data/doc/method_list.html
CHANGED
@@ -57,6 +57,14 @@
|
|
57
57
|
|
58
58
|
|
59
59
|
<li class="r1 ">
|
60
|
+
<span class='object_link'><a href="CodeZauker/Util.html#get_lines-instance_method" title="CodeZauker::Util#get_lines (method)">#get_lines</a></span>
|
61
|
+
|
62
|
+
<small>CodeZauker::Util</small>
|
63
|
+
|
64
|
+
</li>
|
65
|
+
|
66
|
+
|
67
|
+
<li class="r2 ">
|
60
68
|
<span class='object_link'><a href="Grep.html#grep-instance_method" title="Grep#grep (method)">#grep</a></span>
|
61
69
|
|
62
70
|
<small>Grep</small>
|
@@ -64,7 +72,7 @@
|
|
64
72
|
</li>
|
65
73
|
|
66
74
|
|
67
|
-
<li class="
|
75
|
+
<li class="r1 ">
|
68
76
|
<span class='object_link'><a href="CodeZauker/FileScanner.html#initialize-instance_method" title="CodeZauker::FileScanner#initialize (method)">#initialize</a></span>
|
69
77
|
|
70
78
|
<small>CodeZauker::FileScanner</small>
|
@@ -72,6 +80,14 @@
|
|
72
80
|
</li>
|
73
81
|
|
74
82
|
|
83
|
+
<li class="r2 ">
|
84
|
+
<span class='object_link'><a href="CodeZauker/Util.html#is_pdf%3F-instance_method" title="CodeZauker::Util#is_pdf? (method)">#is_pdf?</a></span>
|
85
|
+
|
86
|
+
<small>CodeZauker::Util</small>
|
87
|
+
|
88
|
+
</li>
|
89
|
+
|
90
|
+
|
75
91
|
<li class="r1 ">
|
76
92
|
<span class='object_link'><a href="CodeZauker/FileScanner.html#isearch-instance_method" title="CodeZauker::FileScanner#isearch (method)">#isearch</a></span>
|
77
93
|
|
@@ -6,7 +6,7 @@
|
|
6
6
|
<title>
|
7
7
|
Top Level Namespace
|
8
8
|
|
9
|
-
— Code Zauker 0.0.
|
9
|
+
— Code Zauker 0.0.4 Documentation
|
10
10
|
|
11
11
|
</title>
|
12
12
|
|
@@ -91,12 +91,14 @@
|
|
91
91
|
|
92
92
|
|
93
93
|
|
94
|
+
|
95
|
+
|
94
96
|
</div>
|
95
97
|
|
96
98
|
<div id="footer">
|
97
|
-
Generated on
|
99
|
+
Generated on Sun Feb 12 19:16:27 2012 by
|
98
100
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
99
|
-
0.7.
|
101
|
+
0.7.5 (ruby-1.9.3).
|
100
102
|
</div>
|
101
103
|
|
102
104
|
</body>
|
data/lib/code_zauker.rb
CHANGED
@@ -4,6 +4,7 @@ require "code_zauker/constants"
|
|
4
4
|
require 'redis/connection/hiredis'
|
5
5
|
require 'redis'
|
6
6
|
require 'set'
|
7
|
+
require 'pdf/reader'
|
7
8
|
# This module implements a simple reverse indexer
|
8
9
|
# based on Redis
|
9
10
|
# The idea is inspired by http://swtch.com/~rsc/regexp/regexp4.html
|
@@ -72,6 +73,37 @@ module CodeZauker
|
|
72
73
|
end
|
73
74
|
end
|
74
75
|
|
76
|
+
def is_pdf?(filename)
|
77
|
+
return filename.downcase().end_with?(".pdf")
|
78
|
+
end
|
79
|
+
|
80
|
+
# Obtain lines from a filename
|
81
|
+
# It works even with pdf files
|
82
|
+
def get_lines(filename)
|
83
|
+
lines=[]
|
84
|
+
if self.is_pdf?(filename)
|
85
|
+
# => enable pdf processing....
|
86
|
+
#puts "PDF..."
|
87
|
+
File.open(filename, "rb") do |io|
|
88
|
+
reader = PDF::Reader.new(io)
|
89
|
+
#puts "PDF Scanning...#{reader.info}"
|
90
|
+
reader.pages.each do |page|
|
91
|
+
linesToTrim=page.text.split("\n")
|
92
|
+
linesToTrim.each do |l|
|
93
|
+
lines.push(l.strip())
|
94
|
+
end
|
95
|
+
end
|
96
|
+
#puts "PDF Lines:#{lines.length}"
|
97
|
+
end
|
98
|
+
else
|
99
|
+
File.open(filename,"r") { |f|
|
100
|
+
lines=f.readlines()
|
101
|
+
}
|
102
|
+
end
|
103
|
+
return lines
|
104
|
+
end
|
105
|
+
|
106
|
+
|
75
107
|
end
|
76
108
|
|
77
109
|
# Scan a file and push it inside redis...
|
@@ -128,7 +160,11 @@ module CodeZauker
|
|
128
160
|
|
129
161
|
def pushTrigramsSetRecoverable(s, fid, filename)
|
130
162
|
error=false
|
131
|
-
@redis.multi do
|
163
|
+
# @redis.multi do
|
164
|
+
# From 5.8
|
165
|
+
# to 7.6 Files per sec
|
166
|
+
# changing multi into pipelined
|
167
|
+
@redis.pipelined do
|
132
168
|
s.each do | trigram |
|
133
169
|
@redis.sadd "trigram:#{trigram}",fid
|
134
170
|
@redis.sadd "fscan:trigramsOnFile:#{fid}", trigram
|
@@ -139,7 +175,7 @@ module CodeZauker
|
|
139
175
|
error=true
|
140
176
|
end
|
141
177
|
end
|
142
|
-
end # multi
|
178
|
+
end # multi/pipelined
|
143
179
|
return error
|
144
180
|
end
|
145
181
|
private :pushTrigramsSetRecoverable
|
@@ -169,31 +205,31 @@ module CodeZauker
|
|
169
205
|
# before sending it to redis. This avoids
|
170
206
|
# a lot of spurious work
|
171
207
|
s=Set.new
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
end
|
192
|
-
trigramScanned += 1
|
193
|
-
#puts "#{istart} Trigram fscan:#{trigram}/ FileId: #{fid}"
|
208
|
+
util=Util.new()
|
209
|
+
lines=util.get_lines(filename)
|
210
|
+
adaptiveSize= TRIGRAM_DEFAULT_PUSH_SIZE
|
211
|
+
|
212
|
+
lines.each do |lineNotUTF8|
|
213
|
+
l= util.ensureUTF8(lineNotUTF8)
|
214
|
+
# Split each line into 3-char chunks, and store in a redis set
|
215
|
+
i=0
|
216
|
+
for istart in 0...(l.length-GRAM_SIZE)
|
217
|
+
trigram = l[istart, GRAM_SIZE]
|
218
|
+
# Avoid storing the 3space guy entirely
|
219
|
+
if trigram==SPACE_GUY
|
220
|
+
next
|
221
|
+
end
|
222
|
+
# push the trigram to redis (highly optimized)
|
223
|
+
s.add(trigram)
|
224
|
+
if s.length > adaptiveSize
|
225
|
+
pushTrigramsSet(s,fid,filename)
|
226
|
+
s=Set.new()
|
194
227
|
end
|
228
|
+
trigramScanned += 1
|
229
|
+
#puts "#{istart} Trigram fscan:#{trigram}/ FileId: #{fid}"
|
195
230
|
end
|
196
|
-
|
231
|
+
end
|
232
|
+
|
197
233
|
|
198
234
|
if s.length > 0
|
199
235
|
pushTrigramsSet(s,fid,filename)
|