code_zauker 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +3 -3
- data/bin/czindexer +0 -0
- data/bin/czsearch +22 -3
- data/code_zauker.gemspec +3 -1
- data/doc/CodeZauker.html +10 -8
- data/doc/CodeZauker/FileScanner.html +173 -171
- data/doc/CodeZauker/Util.html +184 -9
- data/doc/Grep.html +5 -3
- data/doc/_index.html +4 -4
- data/doc/frames.html +1 -1
- data/doc/index.html +4 -4
- data/doc/js/full_list.js +6 -0
- data/doc/method_list.html +17 -1
- data/doc/top-level-namespace.html +5 -3
- data/lib/code_zauker.rb +61 -25
- data/lib/code_zauker/constants.rb +3 -3
- data/lib/code_zauker/version.rb +1 -1
- data/readme.org +3 -17
- data/test/fixture/simple_test.pdf +0 -0
- data/test/test_pdf_indexing.rb +38 -0
- metadata +25 -13
- data/CHANGELOG.org +0 -15
data/doc/CodeZauker/Util.html
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
<title>
|
7
7
|
Class: CodeZauker::Util
|
8
8
|
|
9
|
-
— Code Zauker 0.0.
|
9
|
+
— Code Zauker 0.0.4 Documentation
|
10
10
|
|
11
11
|
</title>
|
12
12
|
|
@@ -108,6 +108,8 @@
|
|
108
108
|
|
109
109
|
|
110
110
|
|
111
|
+
|
112
|
+
|
111
113
|
|
112
114
|
<h2>
|
113
115
|
Instance Method Summary
|
@@ -141,6 +143,50 @@ This code try to "guess" the right encoding switching to ISO-8859-1 if
|
|
141
143
|
UTF-8 is not valid.</p>
|
142
144
|
</div></span>
|
143
145
|
|
146
|
+
</li>
|
147
|
+
|
148
|
+
|
149
|
+
<li class="public ">
|
150
|
+
<span class="summary_signature">
|
151
|
+
|
152
|
+
<a href="#get_lines-instance_method" title="#get_lines (instance method)">- (Object) <strong>get_lines</strong>(filename) </a>
|
153
|
+
|
154
|
+
|
155
|
+
|
156
|
+
</span>
|
157
|
+
|
158
|
+
|
159
|
+
|
160
|
+
|
161
|
+
|
162
|
+
|
163
|
+
|
164
|
+
|
165
|
+
<span class="summary_desc"><div class='inline'>
|
166
|
+
<p>Obtain lines from a filename It works even with pdf files.</p>
|
167
|
+
</div></span>
|
168
|
+
|
169
|
+
</li>
|
170
|
+
|
171
|
+
|
172
|
+
<li class="public ">
|
173
|
+
<span class="summary_signature">
|
174
|
+
|
175
|
+
<a href="#is_pdf%3F-instance_method" title="#is_pdf? (instance method)">- (Boolean) <strong>is_pdf?</strong>(filename) </a>
|
176
|
+
|
177
|
+
|
178
|
+
|
179
|
+
</span>
|
180
|
+
|
181
|
+
|
182
|
+
|
183
|
+
|
184
|
+
|
185
|
+
|
186
|
+
|
187
|
+
|
188
|
+
<span class="summary_desc"><div class='inline'></div></span>
|
189
|
+
|
144
190
|
</li>
|
145
191
|
|
146
192
|
|
@@ -207,7 +253,6 @@ interpreted as a UTF-8 whereas it is a ISO-8859 windows code.</p>
|
|
207
253
|
<pre class="lines">
|
208
254
|
|
209
255
|
|
210
|
-
56
|
211
256
|
57
|
212
257
|
58
|
213
258
|
59
|
@@ -224,10 +269,11 @@ interpreted as a UTF-8 whereas it is a ISO-8859 windows code.</p>
|
|
224
269
|
70
|
225
270
|
71
|
226
271
|
72
|
227
|
-
73</pre>
|
272
|
+
73
|
273
|
+
74</pre>
|
228
274
|
</td>
|
229
275
|
<td>
|
230
|
-
<pre class="code"><span class="info file"># File 'lib/code_zauker.rb', line 56</span>
|
276
|
+
<pre class="code"><span class="info file"># File 'lib/code_zauker.rb', line 57</span>
|
231
277
|
|
232
278
|
<span class='kw'>def</span> <span class='id identifier rubyid_ensureUTF8'>ensureUTF8</span><span class='lparen'>(</span><span class='id identifier rubyid_untrusted_string'>untrusted_string</span><span class='rparen'>)</span>
|
233
279
|
<span class='kw'>if</span> <span class='id identifier rubyid_untrusted_string'>untrusted_string</span><span class='period'>.</span><span class='id identifier rubyid_valid_encoding?'>valid_encoding?</span><span class='lparen'>(</span><span class='rparen'>)</span><span class='op'>==</span><span class='kw'>false</span>
|
@@ -250,6 +296,135 @@ interpreted as a UTF-8 whereas it is a ISO-8859 windows code.</p>
|
|
250
296
|
</td>
|
251
297
|
</tr>
|
252
298
|
</table>
|
299
|
+
</div>
|
300
|
+
|
301
|
+
<div class="method_details ">
|
302
|
+
<p class="signature " id="get_lines-instance_method">
|
303
|
+
|
304
|
+
- (<tt>Object</tt>) <strong>get_lines</strong>(filename)
|
305
|
+
|
306
|
+
|
307
|
+
|
308
|
+
</p><div class="docstring">
|
309
|
+
<div class="discussion">
|
310
|
+
|
311
|
+
<p>Obtain lines from a filename It works even with pdf files</p>
|
312
|
+
|
313
|
+
|
314
|
+
</div>
|
315
|
+
</div>
|
316
|
+
<div class="tags">
|
317
|
+
|
318
|
+
|
319
|
+
</div><table class="source_code">
|
320
|
+
<tr>
|
321
|
+
<td>
|
322
|
+
<pre class="lines">
|
323
|
+
|
324
|
+
|
325
|
+
82
|
326
|
+
83
|
327
|
+
84
|
328
|
+
85
|
329
|
+
86
|
330
|
+
87
|
331
|
+
88
|
332
|
+
89
|
333
|
+
90
|
334
|
+
91
|
335
|
+
92
|
336
|
+
93
|
337
|
+
94
|
338
|
+
95
|
339
|
+
96
|
340
|
+
97
|
341
|
+
98
|
342
|
+
99
|
343
|
+
100
|
344
|
+
101
|
345
|
+
102
|
346
|
+
103
|
347
|
+
104</pre>
|
348
|
+
</td>
|
349
|
+
<td>
|
350
|
+
<pre class="code"><span class="info file"># File 'lib/code_zauker.rb', line 82</span>
|
351
|
+
|
352
|
+
<span class='kw'>def</span> <span class='id identifier rubyid_get_lines'>get_lines</span><span class='lparen'>(</span><span class='id identifier rubyid_filename'>filename</span><span class='rparen'>)</span>
|
353
|
+
<span class='id identifier rubyid_lines'>lines</span><span class='op'>=</span><span class='lbracket'>[</span><span class='rbracket'>]</span>
|
354
|
+
<span class='kw'>if</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_is_pdf?'>is_pdf?</span><span class='lparen'>(</span><span class='id identifier rubyid_filename'>filename</span><span class='rparen'>)</span>
|
355
|
+
<span class='comment'># => enable pdf processing....
|
356
|
+
</span> <span class='comment'>#puts "PDF..."
|
357
|
+
</span> <span class='const'>File</span><span class='period'>.</span><span class='id identifier rubyid_open'>open</span><span class='lparen'>(</span><span class='id identifier rubyid_filename'>filename</span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>rb</span><span class='tstring_end'>"</span></span><span class='rparen'>)</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_io'>io</span><span class='op'>|</span>
|
358
|
+
<span class='id identifier rubyid_reader'>reader</span> <span class='op'>=</span> <span class='const'>PDF</span><span class='op'>::</span><span class='const'>Reader</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_io'>io</span><span class='rparen'>)</span>
|
359
|
+
<span class='comment'>#puts "PDF Scanning...#{reader.info}"
|
360
|
+
</span> <span class='id identifier rubyid_reader'>reader</span><span class='period'>.</span><span class='id identifier rubyid_pages'>pages</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_page'>page</span><span class='op'>|</span>
|
361
|
+
<span class='id identifier rubyid_linesToTrim'>linesToTrim</span><span class='op'>=</span><span class='id identifier rubyid_page'>page</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span><span class='period'>.</span><span class='id identifier rubyid_split'>split</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>\n</span><span class='tstring_end'>"</span></span><span class='rparen'>)</span>
|
362
|
+
<span class='id identifier rubyid_linesToTrim'>linesToTrim</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_l'>l</span><span class='op'>|</span>
|
363
|
+
<span class='id identifier rubyid_lines'>lines</span><span class='period'>.</span><span class='id identifier rubyid_push'>push</span><span class='lparen'>(</span><span class='id identifier rubyid_l'>l</span><span class='period'>.</span><span class='id identifier rubyid_strip'>strip</span><span class='lparen'>(</span><span class='rparen'>)</span><span class='rparen'>)</span>
|
364
|
+
<span class='kw'>end</span>
|
365
|
+
<span class='kw'>end</span>
|
366
|
+
<span class='comment'>#puts "PDF Lines:#{lines.length}"
|
367
|
+
</span> <span class='kw'>end</span>
|
368
|
+
<span class='kw'>else</span>
|
369
|
+
<span class='const'>File</span><span class='period'>.</span><span class='id identifier rubyid_open'>open</span><span class='lparen'>(</span><span class='id identifier rubyid_filename'>filename</span><span class='comma'>,</span><span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>r</span><span class='tstring_end'>"</span></span><span class='rparen'>)</span> <span class='lbrace'>{</span> <span class='op'>|</span><span class='id identifier rubyid_f'>f</span><span class='op'>|</span>
|
370
|
+
<span class='id identifier rubyid_lines'>lines</span><span class='op'>=</span><span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_readlines'>readlines</span><span class='lparen'>(</span><span class='rparen'>)</span>
|
371
|
+
<span class='rbrace'>}</span>
|
372
|
+
<span class='kw'>end</span>
|
373
|
+
<span class='kw'>return</span> <span class='id identifier rubyid_lines'>lines</span>
|
374
|
+
<span class='kw'>end</span></pre>
|
375
|
+
</td>
|
376
|
+
</tr>
|
377
|
+
</table>
|
378
|
+
</div>
|
379
|
+
|
380
|
+
<div class="method_details ">
|
381
|
+
<p class="signature " id="is_pdf?-instance_method">
|
382
|
+
|
383
|
+
- (<tt>Boolean</tt>) <strong>is_pdf?</strong>(filename)
|
384
|
+
|
385
|
+
|
386
|
+
|
387
|
+
</p><div class="docstring">
|
388
|
+
<div class="discussion">
|
389
|
+
|
390
|
+
|
391
|
+
</div>
|
392
|
+
</div>
|
393
|
+
<div class="tags">
|
394
|
+
|
395
|
+
<h3>Returns:</h3>
|
396
|
+
<ul class="return">
|
397
|
+
|
398
|
+
<li>
|
399
|
+
|
400
|
+
|
401
|
+
<span class='type'>(<tt>Boolean</tt>)</span>
|
402
|
+
|
403
|
+
|
404
|
+
|
405
|
+
</li>
|
406
|
+
|
407
|
+
</ul>
|
408
|
+
|
409
|
+
</div><table class="source_code">
|
410
|
+
<tr>
|
411
|
+
<td>
|
412
|
+
<pre class="lines">
|
413
|
+
|
414
|
+
|
415
|
+
76
|
416
|
+
77
|
417
|
+
78</pre>
|
418
|
+
</td>
|
419
|
+
<td>
|
420
|
+
<pre class="code"><span class="info file"># File 'lib/code_zauker.rb', line 76</span>
|
421
|
+
|
422
|
+
<span class='kw'>def</span> <span class='id identifier rubyid_is_pdf?'>is_pdf?</span><span class='lparen'>(</span><span class='id identifier rubyid_filename'>filename</span><span class='rparen'>)</span>
|
423
|
+
<span class='kw'>return</span> <span class='id identifier rubyid_filename'>filename</span><span class='period'>.</span><span class='id identifier rubyid_downcase'>downcase</span><span class='lparen'>(</span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_end_with?'>end_with?</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>.pdf</span><span class='tstring_end'>"</span></span><span class='rparen'>)</span>
|
424
|
+
<span class='kw'>end</span></pre>
|
425
|
+
</td>
|
426
|
+
</tr>
|
427
|
+
</table>
|
253
428
|
</div>
|
254
429
|
|
255
430
|
<div class="method_details ">
|
@@ -277,7 +452,6 @@ TODO: Very bad implementation, need improvements</p>
|
|
277
452
|
<pre class="lines">
|
278
453
|
|
279
454
|
|
280
|
-
19
|
281
455
|
20
|
282
456
|
21
|
283
457
|
22
|
@@ -306,10 +480,11 @@ TODO: Very bad implementation, need improvements</p>
|
|
306
480
|
45
|
307
481
|
46
|
308
482
|
47
|
309
|
-
48</pre>
|
483
|
+
48
|
484
|
+
49</pre>
|
310
485
|
</td>
|
311
486
|
<td>
|
312
|
-
<pre class="code"><span class="info file"># File 'lib/code_zauker.rb', line 19</span>
|
487
|
+
<pre class="code"><span class="info file"># File 'lib/code_zauker.rb', line 20</span>
|
313
488
|
|
314
489
|
<span class='kw'>def</span> <span class='id identifier rubyid_mixCase'>mixCase</span><span class='lparen'>(</span><span class='id identifier rubyid_trigram'>trigram</span><span class='rparen'>)</span>
|
315
490
|
<span class='id identifier rubyid_caseMixedElements'>caseMixedElements</span><span class='op'>=</span><span class='lbracket'>[</span><span class='rbracket'>]</span>
|
@@ -351,9 +526,9 @@ TODO: Very bad implementation, need improvements</p>
|
|
351
526
|
</div>
|
352
527
|
|
353
528
|
<div id="footer">
|
354
|
-
Generated on
|
529
|
+
Generated on Sun Feb 12 19:16:26 2012 by
|
355
530
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
356
|
-
0.7.
|
531
|
+
0.7.5 (ruby-1.9.3).
|
357
532
|
</div>
|
358
533
|
|
359
534
|
</body>
|
data/doc/Grep.html
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
<title>
|
7
7
|
Module: Grep
|
8
8
|
|
9
|
-
— Code Zauker 0.0.
|
9
|
+
— Code Zauker 0.0.4 Documentation
|
10
10
|
|
11
11
|
</title>
|
12
12
|
|
@@ -82,6 +82,8 @@
|
|
82
82
|
|
83
83
|
|
84
84
|
|
85
|
+
|
86
|
+
|
85
87
|
|
86
88
|
<h2>
|
87
89
|
Instance Method Summary
|
@@ -335,9 +337,9 @@ will be printed.</p>
|
|
335
337
|
</div>
|
336
338
|
|
337
339
|
<div id="footer">
|
338
|
-
Generated on
|
340
|
+
Generated on Sun Feb 12 19:16:27 2012 by
|
339
341
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
340
|
-
0.7.
|
342
|
+
0.7.5 (ruby-1.9.3).
|
341
343
|
</div>
|
342
344
|
|
343
345
|
</body>
|
data/doc/_index.html
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
<head>
|
5
5
|
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
6
6
|
<title>
|
7
|
-
Code Zauker 0.0.
|
7
|
+
Code Zauker 0.0.4 Documentation
|
8
8
|
|
9
9
|
</title>
|
10
10
|
|
@@ -52,7 +52,7 @@
|
|
52
52
|
|
53
53
|
<iframe id="search_frame"></iframe>
|
54
54
|
|
55
|
-
<div id="content"><h1 class="noborder title">Code Zauker 0.0.
|
55
|
+
<div id="content"><h1 class="noborder title">Code Zauker 0.0.4 Documentation</h1>
|
56
56
|
<div id="listing">
|
57
57
|
<h1 class="alphaindex">Alphabetic Index</h1>
|
58
58
|
|
@@ -131,9 +131,9 @@
|
|
131
131
|
</div>
|
132
132
|
|
133
133
|
<div id="footer">
|
134
|
-
Generated on
|
134
|
+
Generated on Sun Feb 12 19:16:25 2012 by
|
135
135
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
136
|
-
0.7.
|
136
|
+
0.7.5 (ruby-1.9.3).
|
137
137
|
</div>
|
138
138
|
|
139
139
|
</body>
|
data/doc/frames.html
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
5
5
|
<head>
|
6
6
|
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
|
7
|
-
<title>Code Zauker 0.0.
|
7
|
+
<title>Code Zauker 0.0.4 Documentation</title>
|
8
8
|
</head>
|
9
9
|
<frameset cols="20%,*">
|
10
10
|
<frame name="list" src="class_list.html" />
|
data/doc/index.html
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
<head>
|
5
5
|
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
6
6
|
<title>
|
7
|
-
Code Zauker 0.0.
|
7
|
+
Code Zauker 0.0.4 Documentation
|
8
8
|
|
9
9
|
</title>
|
10
10
|
|
@@ -52,7 +52,7 @@
|
|
52
52
|
|
53
53
|
<iframe id="search_frame"></iframe>
|
54
54
|
|
55
|
-
<div id="content"><h1 class="noborder title">Code Zauker 0.0.
|
55
|
+
<div id="content"><h1 class="noborder title">Code Zauker 0.0.4 Documentation</h1>
|
56
56
|
<div id="listing">
|
57
57
|
<h1 class="alphaindex">Alphabetic Index</h1>
|
58
58
|
|
@@ -131,9 +131,9 @@
|
|
131
131
|
</div>
|
132
132
|
|
133
133
|
<div id="footer">
|
134
|
-
Generated on
|
134
|
+
Generated on Sun Feb 12 19:16:25 2012 by
|
135
135
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
136
|
-
0.7.
|
136
|
+
0.7.5 (ruby-1.9.3).
|
137
137
|
</div>
|
138
138
|
|
139
139
|
</body>
|
data/doc/js/full_list.js
CHANGED
@@ -4,6 +4,9 @@ var searchCache = [];
|
|
4
4
|
var searchString = '';
|
5
5
|
var regexSearchString = '';
|
6
6
|
var caseSensitiveMatch = false;
|
7
|
+
var ignoreKeyCodeMin = 8;
|
8
|
+
var ignoreKeyCodeMax = 46;
|
9
|
+
var commandKey = 91;
|
7
10
|
|
8
11
|
RegExp.escape = function(text) {
|
9
12
|
return text.replace(/[-[\]{}()*+?.,\\^$|#\s]/g, "\\$&");
|
@@ -19,6 +22,9 @@ function fullListSearch() {
|
|
19
22
|
});
|
20
23
|
|
21
24
|
$('#search input').keyup(function() {
|
25
|
+
if ((event.keyCode > ignoreKeyCodeMin && event.keyCode < ignoreKeyCodeMax)
|
26
|
+
|| event.keyCode == commandKey)
|
27
|
+
return;
|
22
28
|
searchString = this.value;
|
23
29
|
caseSensitiveMatch = searchString.match(/[A-Z]/) != null;
|
24
30
|
regexSearchString = RegExp.escape(searchString);
|
data/doc/method_list.html
CHANGED
@@ -57,6 +57,14 @@
|
|
57
57
|
|
58
58
|
|
59
59
|
<li class="r1 ">
|
60
|
+
<span class='object_link'><a href="CodeZauker/Util.html#get_lines-instance_method" title="CodeZauker::Util#get_lines (method)">#get_lines</a></span>
|
61
|
+
|
62
|
+
<small>CodeZauker::Util</small>
|
63
|
+
|
64
|
+
</li>
|
65
|
+
|
66
|
+
|
67
|
+
<li class="r2 ">
|
60
68
|
<span class='object_link'><a href="Grep.html#grep-instance_method" title="Grep#grep (method)">#grep</a></span>
|
61
69
|
|
62
70
|
<small>Grep</small>
|
@@ -64,7 +72,7 @@
|
|
64
72
|
</li>
|
65
73
|
|
66
74
|
|
67
|
-
<li class="r2 ">
|
75
|
+
<li class="r1 ">
|
68
76
|
<span class='object_link'><a href="CodeZauker/FileScanner.html#initialize-instance_method" title="CodeZauker::FileScanner#initialize (method)">#initialize</a></span>
|
69
77
|
|
70
78
|
<small>CodeZauker::FileScanner</small>
|
@@ -72,6 +80,14 @@
|
|
72
80
|
</li>
|
73
81
|
|
74
82
|
|
83
|
+
<li class="r2 ">
|
84
|
+
<span class='object_link'><a href="CodeZauker/Util.html#is_pdf%3F-instance_method" title="CodeZauker::Util#is_pdf? (method)">#is_pdf?</a></span>
|
85
|
+
|
86
|
+
<small>CodeZauker::Util</small>
|
87
|
+
|
88
|
+
</li>
|
89
|
+
|
90
|
+
|
75
91
|
<li class="r1 ">
|
76
92
|
<span class='object_link'><a href="CodeZauker/FileScanner.html#isearch-instance_method" title="CodeZauker::FileScanner#isearch (method)">#isearch</a></span>
|
77
93
|
|
data/doc/top-level-namespace.html
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
<title>
|
7
7
|
Top Level Namespace
|
8
8
|
|
9
|
-
— Code Zauker 0.0.
|
9
|
+
— Code Zauker 0.0.4 Documentation
|
10
10
|
|
11
11
|
</title>
|
12
12
|
|
@@ -91,12 +91,14 @@
|
|
91
91
|
|
92
92
|
|
93
93
|
|
94
|
+
|
95
|
+
|
94
96
|
</div>
|
95
97
|
|
96
98
|
<div id="footer">
|
97
|
-
Generated on
|
99
|
+
Generated on Sun Feb 12 19:16:27 2012 by
|
98
100
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
99
|
-
0.7.
|
101
|
+
0.7.5 (ruby-1.9.3).
|
100
102
|
</div>
|
101
103
|
|
102
104
|
</body>
|
data/lib/code_zauker.rb
CHANGED
@@ -4,6 +4,7 @@ require "code_zauker/constants"
|
|
4
4
|
require 'redis/connection/hiredis'
|
5
5
|
require 'redis'
|
6
6
|
require 'set'
|
7
|
+
require 'pdf/reader'
|
7
8
|
# This module implements a simple reverse indexer
|
8
9
|
# based on Redis
|
9
10
|
# The idea is ispired by http://swtch.com/~rsc/regexp/regexp4.html
|
@@ -72,6 +73,37 @@ module CodeZauker
|
|
72
73
|
end
|
73
74
|
end
|
74
75
|
|
76
|
+
def is_pdf?(filename)
|
77
|
+
return filename.downcase().end_with?(".pdf")
|
78
|
+
end
|
79
|
+
|
80
|
+
# Obtain lines from a filename
|
81
|
+
# It works even with pdf files
|
82
|
+
def get_lines(filename)
|
83
|
+
lines=[]
|
84
|
+
if self.is_pdf?(filename)
|
85
|
+
# => enable pdf processing....
|
86
|
+
#puts "PDF..."
|
87
|
+
File.open(filename, "rb") do |io|
|
88
|
+
reader = PDF::Reader.new(io)
|
89
|
+
#puts "PDF Scanning...#{reader.info}"
|
90
|
+
reader.pages.each do |page|
|
91
|
+
linesToTrim=page.text.split("\n")
|
92
|
+
linesToTrim.each do |l|
|
93
|
+
lines.push(l.strip())
|
94
|
+
end
|
95
|
+
end
|
96
|
+
#puts "PDF Lines:#{lines.length}"
|
97
|
+
end
|
98
|
+
else
|
99
|
+
File.open(filename,"r") { |f|
|
100
|
+
lines=f.readlines()
|
101
|
+
}
|
102
|
+
end
|
103
|
+
return lines
|
104
|
+
end
|
105
|
+
|
106
|
+
|
75
107
|
end
|
76
108
|
|
77
109
|
# Scan a file and push it inside redis...
|
@@ -128,7 +160,11 @@ module CodeZauker
|
|
128
160
|
|
129
161
|
def pushTrigramsSetRecoverable(s, fid, filename)
|
130
162
|
error=false
|
131
|
-
@redis.multi do
|
163
|
+
# @redis.multi do
|
164
|
+
# From 5.8
|
165
|
+
# to 7.6 Files per sec
|
166
|
+
# changing multi into pipielined
|
167
|
+
@redis.pipelined do
|
132
168
|
s.each do | trigram |
|
133
169
|
@redis.sadd "trigram:#{trigram}",fid
|
134
170
|
@redis.sadd "fscan:trigramsOnFile:#{fid}", trigram
|
@@ -139,7 +175,7 @@ module CodeZauker
|
|
139
175
|
error=true
|
140
176
|
end
|
141
177
|
end
|
142
|
-
end # multi
|
178
|
+
end # multi/pipelined
|
143
179
|
return error
|
144
180
|
end
|
145
181
|
private :pushTrigramsSetRecoverable
|
@@ -169,31 +205,31 @@ module CodeZauker
|
|
169
205
|
# before sending it to redis. This avoid
|
170
206
|
# a lot of spourios work
|
171
207
|
s=Set.new
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
end
|
192
|
-
trigramScanned += 1
|
193
|
-
#puts "#{istart} Trigram fscan:#{trigram}/ FileId: #{fid}"
|
208
|
+
util=Util.new()
|
209
|
+
lines=util.get_lines(filename)
|
210
|
+
adaptiveSize= TRIGRAM_DEFAULT_PUSH_SIZE
|
211
|
+
|
212
|
+
lines.each do |lineNotUTF8|
|
213
|
+
l= util.ensureUTF8(lineNotUTF8)
|
214
|
+
# Split each line into 3-char chunks, and store in a redis set
|
215
|
+
i=0
|
216
|
+
for istart in 0...(l.length-GRAM_SIZE)
|
217
|
+
trigram = l[istart, GRAM_SIZE]
|
218
|
+
# Avoid storing the 3space guy enterely
|
219
|
+
if trigram==SPACE_GUY
|
220
|
+
next
|
221
|
+
end
|
222
|
+
# push the trigram to redis (highly optimized)
|
223
|
+
s.add(trigram)
|
224
|
+
if s.length > adaptiveSize
|
225
|
+
pushTrigramsSet(s,fid,filename)
|
226
|
+
s=Set.new()
|
194
227
|
end
|
228
|
+
trigramScanned += 1
|
229
|
+
#puts "#{istart} Trigram fscan:#{trigram}/ FileId: #{fid}"
|
195
230
|
end
|
196
|
-
|
231
|
+
end
|
232
|
+
|
197
233
|
|
198
234
|
if s.length > 0
|
199
235
|
pushTrigramsSet(s,fid,filename)
|