thomaspeklak-OfflineSearch 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,257 @@
1
+ # temporary storage
2
+ # class to store crawled data before the data is written to a file
3
+ #
4
+ # options:
5
+ # database
6
+ # sqlite, mysql
7
+ # filesystem
8
+ #
9
+ # * $Author$
10
+ # * $Rev$
11
+ # * $LastChangedDate$
12
+
13
# Facade that buffers crawled data (files, terms and links) in a pluggable
# backend before the data is written to a file.
#
# Supported modes are 'sqlite' (database file 'storage.db') and 'memory'.
class Temporary_Storage
  attr_reader :storage_handler

  # Initializes the storage handler.
  #
  # mode - 'sqlite' or 'memory'. Any other value logs an error and exits.
  def initialize(mode)
    @storage_handler =
      case mode
      when 'sqlite' then Sqlite.new('storage.db')
      when 'memory' then Memory.new
      else
        message = "no appropriate stroage is selected\nvalid options:\n\tsqlite\n\tmemory"
        # $logger is configured outside this class; fall back to stderr when it is unset.
        $logger ? $logger.error(message) : warn(message)
        exit
      end
  end

  # Stores file name, title and page rank and makes the file the "current"
  # one that subsequent store_term calls are attached to.
  # The page rank starts at 1 so that it can be used as a multiplier.
  def store_file(filename, title, pagerank = 1)
    # NOTE(review): this gsub replaces '"' with '"' and is therefore a no-op;
    # presumably an escape sequence was intended - confirm against the consumer.
    @storage_handler.store_file(filename.to_s, title.to_s.gsub('"', '"'), pagerank)
  end

  # Stores the term and its rank for the current file.
  def store_term(term, rank)
    @storage_handler.store_term(term, rank)
  end

  # Stores an array of links.
  def store_link(links)
    @storage_handler.store_link(links)
  end

  # Returns a hash with the keys 'filename', 'titel' and 'pagerank' for a
  # stored file, or nil when the backend has no entry for +filename+.
  def get_file(filename)
    row = @storage_handler.get_file(filename)
    return nil unless row

    { 'filename' => row[0],
      'titel'    => row[1],
      'pagerank' => row[2] }
  end

  # Returns the stored files in the backend's native shape.
  def get_files
    @storage_handler.get_files
  end

  # Returns the stored links in the backend's native shape.
  def get_links
    @storage_handler.get_links
  end

  # Returns the stored terms.
  # NOTE(review): only the Memory backend implements get_terms; in sqlite
  # mode this call raises NoMethodError.
  def get_terms
    @storage_handler.get_terms
  end

  # Calculates the page rank: a file that appears as a link target gets the
  # number of inbound links as its page rank; other files keep their value.
  def calculate_pageranks_from_links
    @storage_handler.calculate_pageranks_from_links
  end

  # The nested classes below are backend implementations. The original code
  # preceded them with a bare `private`, which has no effect on constants, so
  # they remain reachable as Temporary_Storage::Memory / ::Sqlite.

  # Implements a storage handler that keeps everything in memory.
  class Memory
    def initialize
      @files = {}
      @terms = Terms.new
      @links = Links.new
      @current_document = nil
    end

    # Stores the file in the files hash and remembers it as the current
    # document. Returns the new Document.
    def store_file(filename, title, pagerank = 1)
      @current_document = Document.new(filename, title, pagerank)
      @files[filename] = @current_document
    end

    # Stores a term for the current document.
    def store_term(term, rank)
      @terms.store(term, Term2Document.new(@current_document, rank))
    end

    # Counts every link in +links+ as one inbound link.
    def store_link(links)
      @links.add(links)
    end

    # Returns [name, title, page_rank] for a stored file (the positional
    # layout Temporary_Storage#get_file reads), or nil when it is unknown.
    def get_file(filename)
      document = @files[filename]
      document && [document.name, document.title, document.page_rank]
    end

    # Returns the files hash (filename => Document).
    def get_files
      @files
    end

    # Returns the terms hash (term => array of Term2Document).
    def get_terms
      @terms.get_all
    end

    # Returns the links hash (link => number of occurrences).
    def get_links
      @links.get_all
    end

    # Sets the page rank of every stored file that is a link target to its
    # inbound link count. Files without inbound links are left untouched.
    def calculate_pageranks_from_links
      @links.get_all.each do |link, rank|
        @files[link].page_rank = rank if @files.key?(link)
      end
    end

    # Represents a document: an internal running id, the file name, the title
    # and the page rank. All attributes are accessible.
    class Document
      @@ID = 0
      attr_accessor :ID, :name, :title, :page_rank

      def initialize(name, title, page_rank)
        @ID = @@ID += 1
        @name = name
        @title = title
        @page_rank = page_rank
      end
    end

    # Represents a hash of terms and their corresponding documents.
    class Terms
      def initialize
        @terms = {}
      end

      # Appends the term-to-document link to the term's list, creating the
      # list on first use. Returns the list.
      def store(term, term2document)
        (@terms[term] ||= []) << term2document
      end

      # Returns the list stored for one term (nil when unknown).
      def get_one(term)
        @terms[term]
      end

      # Returns the terms hash.
      def get_all
        @terms
      end
    end

    # Represents a link from a term to a document, including the semantic
    # value (rank) of the term. All attributes are accessible.
    class Term2Document
      attr_accessor :document, :rank

      def initialize(document, rank)
        @document = document
        @rank = rank
      end
    end

    # Represents the unique links of all indexed documents.
    class Links
      def initialize
        @links = {}
      end

      # Adds each link with a count of 1, or increases the count by one if
      # the link already exists.
      def add(links)
        links.each { |link| @links[link] = @links.fetch(link, 0) + 1 }
      end

      # Returns all links (link => count).
      def get_all
        @links
      end
    end
  end

  # Implements a storage handler backed by an SQLite database file.
  class Sqlite
    # Recreates the database file +db+ from scratch and sets up the schema.
    def initialize(db)
      # Loaded lazily so that the memory backend works without the sqlite3 gem.
      require 'rubygems'
      require 'sqlite3'
      @current_file_id = nil
      begin
        File.delete(db)
      rescue SystemCallError
        # Best effort: typically there is simply no previous database file.
      end
      @db = SQLite3::Database.new(db)
      # Only set where the installed sqlite3 gem still offers the switch.
      @db.type_translation = true if @db.respond_to?(:type_translation=)
      sql = "
        create table files(
          id integer not null primary key autoincrement,
          filename varchar2(255),
          title varchar2(255),
          pagerank integer
        );
        create table terms(
          id integer not null primary key autoincrement,
          term varchar2(255) unique not null
        );
        create table files_terms(
          file_id integer not null,
          term_id integer not null,
          rank integer not null
        );
        create table links(
          link varchar2(255) not null primary key,
          links_in integer
        )
      "
      @db.execute_batch(sql)
    end

    # Inserts a file row and remembers its id for subsequent store_term calls.
    def store_file(filename, title, pagerank)
      @db.execute('insert into files (filename, title, pagerank) values ( ?, ?, ? )',
                  [filename, title, pagerank])
      @current_file_id = @db.last_insert_row_id
    end

    # Links +term+ (inserted on first use) to the current file with +rank+.
    def store_term(term, rank)
      term_id = @db.get_first_value('select id from terms where term = ?', term)
      unless term_id
        @db.execute('insert into terms (term) values (?)', [term])
        term_id = @db.last_insert_row_id
      end
      @db.execute('insert into files_terms values (?,?,?)', [@current_file_id, term_id, rank])
    end

    # Counts every link in +links+ as one inbound link.
    def store_link(links)
      links.each do |link|
        # Increment inside SQL; when no row was touched the link is new.
        @db.execute('update links set links_in = links_in + 1 where link = ?', [link])
        @db.execute('insert into links values (?,?)', [link, 1]) if @db.changes.zero?
      end
    end

    # Returns [filename, title, pagerank] for a stored file or nil. The
    # columns are listed explicitly so the row matches the positional layout
    # Temporary_Storage#get_file reads (a `select *` would put id first).
    def get_file(filename)
      @db.get_first_row('select filename, title, pagerank from files where filename = ?', filename)
    end

    # Returns one row per (file, term) pair.
    def get_files
      @db.execute('select * from files f join files_terms ft on f.id = ft.file_id')
    end

    # Returns all rows of the links table.
    def get_links
      @db.execute('select * from links')
    end

    # Sets the page rank of every file that is a link target to its inbound
    # link count; other files are left untouched.
    def calculate_pageranks_from_links
      @db.execute('update files set pagerank = ' \
                  '(select links_in from links where links.link = files.filename) ' \
                  'where filename in (select link from links)')
    end
  end
end
@@ -0,0 +1,290 @@
1
+ (function(){
2
/**
 * Computes the Double Metaphone codes of this string.
 *
 * @returns {Array} [primary, secondary]; each code is at most four
 *   characters long and secondary is null when it equals primary.
 */
String.prototype.doubleMetaphone = function(){
    var primary = [], secondary = [], current = 0;
    // Measure the upper-cased text, not `this`: toUpperCase() can change the
    // length (e.g. 'ß' becomes 'SS'), which would otherwise cut off the tail.
    var upper = this.toUpperCase();
    // The trailing space lets the lookup rules read one character past the end.
    var original = upper + ' ', length = upper.length, last = length - 1;
    // skip the silent first letter of these initial pairs
    if(/^(GN|KN|PN|WR|PS)$/.test(original.substr(0,2)))
        current += 1;
    // an initial X is encoded as S
    if('X' == original.substr(0, 1)){
        primary.push('S');
        secondary.push('S');
        current += 1;
    }
    // main loop: each lookup yields [primary part, secondary part, characters consumed]
    var resultset;
    while ((primary.length < 4 || secondary.length < 4) && current <= length){
        resultset = double_metaphone_lookup(original, current, length, last);
        if(resultset[0]) primary.push(resultset[0]);
        if(resultset[1]) secondary.push(resultset[1]);
        current += resultset[2];
    }
    primary = primary.join('').substr(0,4);
    secondary = secondary.join('').substr(0,4);
    return [primary, (primary == secondary) ? null : secondary];
};
24
/**
 * Tells whether this string contains one of the markers W, K, CZ or WITZ.
 *
 * @returns {boolean} true when at least one marker occurs anywhere in the string
 */
String.prototype.slavo_germanic = function(){
    var text = String(this);
    var markers = ['W', 'K', 'CZ', 'WITZ'];
    for (var i = 0; i < markers.length; i++){
        if (text.indexOf(markers[i]) !== -1)
            return true;
    }
    return false;
};
27
/**
 * Tells whether this string is exactly one upper-case vowel (A, E, I, O, U or Y).
 *
 * @returns {boolean} false for the empty string, longer strings and lower-case letters
 */
String.prototype.vowel = function() {
    var letter = String(this);
    if (letter.length !== 1)
        return false;
    return 'AEIOUY'.indexOf(letter) !== -1;
};
30
// One-letter constants so the rules below can name letters without quoting them.
var A='A',B='B',C='C',D='D',E='E',F='F',G='G',H='H',I='I',J='J',K='K',L='L',M='M',N='N',O='O',P='P',Q='Q',R='R',S='S',T='T',U='U',V='V',W='W',X='X',Y='Y',Z='Z';

/**
 * Encodes the letter at position `pos` of `str` according to the Double
 * Metaphone rules.
 *
 * @param {string} str    upper-cased text followed by one padding space
 * @param {number} pos    index of the letter to encode
 * @param {number} length length of the text without the padding space
 * @param {number} last   index of the last letter (length - 1)
 * @returns {Array} [primary part, secondary part, characters consumed];
 *   a part is null (or '') when the rule contributes nothing to that code
 */
function double_metaphone_lookup(str, pos, length, last){
    var cl = str.charAt(pos); // current letter
    var current;              // characters consumed by rules that compute it up front
    switch (true){
        case cl.vowel():
            // vowels are only encoded at the start of the word
            return (pos) ? [null, null, 1] : [A,A,1];
        case cl==B:
            return [P,P,(B == str.charAt(pos+1))?2:1];
        case cl=='Ç':
            return [S,S,1];
        case cl==C:
            if(pos>1 && !str.charAt(pos-2).vowel() && 'ACH' == str.substr(pos-1,3) && str.charAt(pos+2) != I && (str.charAt(pos+2) != E || /^(B|M)ACHER$/.test(str.substr(pos-2,6))))
                return [K,K,2];
            else if(!pos && 'CAESAR' == str.substr(pos,6))
                return [S,S,2];
            else if('CHIA' == str.substr(pos, 4))
                return [K,K,2];
            else if('CH' == str.substr(pos,2)){
                if(pos && 'CHAE' == str.substr(pos,4))
                    return [K,X,2];
                else if(!pos && ($.in_array(['HARAC', 'HARIS'],str.substr(pos+1,5)) || $.in_array(['HOR', 'HYM', 'HIA', 'HEM'],str.substr(pos+1,3))) && str.substr(0,5) != 'CHORE')
                    return [K,K,2];
                // The prefixes carry a trailing space: they are compared with a
                // four-character substring and could never match without it.
                else if($.in_array(['VON ','VAN '],str.substr(0,4)) || 'SCH' == str.substr(0,3) || $.in_array(['ORCHES','ARCHIT','ORCHID'],str.substr(pos-2,6)) || /^[TS]$/.test(str.charAt(pos+2)) || ((!pos || /^[AOUE]$/.test(str.charAt(pos-1))) && /^[LRNMBHFVW ]$/.test(str.charAt(pos+2))))
                    return [K,K,2];
                else if(pos)
                    return [('MC' == str.substr(0,2))?K:X,K,2];
                else
                    return [X,X,2];
            }
            else if(Z == str.charAt(pos+1) && 'WI' != str.substr(pos-2,2))
                return [S,X,2];
            else if('CIA' == str.substr(pos+1,3))
                return [X,X,3];
            // double C, except in an initial "MCC"; the exception is a single
            // negated conjunction so that words like "ACCIDENT" reach the KS rule
            else if(C == str.charAt(pos+1) && !(1 == pos && M == str.charAt(0))){
                if(/^[IEH]$/.test(str.charAt(pos+2)) && 'HU' != str.substr(pos+2,2)){
                    if ((1 == pos && A == str.charAt(pos-1)) || /^UCCE(E|S)$/.test(str.substr(pos-1,5)))
                        return ['KS','KS',3];
                    else
                        return [X,X,3];
                }
                else
                    return [K,K,2];
            }
            else if(/^[KGQ]$/.test(str.charAt(pos+1)))
                return [K,K,2];
            else if(/^[IEY]$/.test(str.charAt(pos+1)))
                return [S,(/^I(O|E|A)$/.test(str.substr(pos+1, 2)) ? X : S), 2];
            else{
                if(/^ (C|Q|G)$/.test(str.substr(pos+1,2)))
                    return [K,K,3];
                else
                    return [K,K,(/^[CKQ]$/.test(str.charAt(pos+1)) && !($.in_array(['CE','CI'],str.substr(pos+1,2))))? 2: 1];
            }
        case cl==D:
            if(str.charAt(pos+1)==G){
                if(/^[IEY]$/.test(str.charAt(pos+2)))
                    return [J,J,3];
                else
                    return ['TK','TK',2];
            }
            else
                return [T,T,(/^[DT]$/.test(str.charAt(pos+1)))? 2:1];
        case cl==F:
            return [F,F,(F==str.charAt(pos+1))?2:1];
        case cl==G:
            if(H==str.charAt(pos+1)){
                if(pos && !str.charAt(pos-1).vowel())
                    return [K,K,2];
                else if(!pos){
                    if(I == str.charAt(pos+2))
                        return [J,J,2];
                    else
                        return [K,K,2];
                }
                else if((pos>1 && /^[BHD]$/.test(str.charAt(pos-2))) || (pos>2 && /^[BHD]$/.test(str.charAt(pos-3))) || (pos>3 && /^[BH]$/.test(str.charAt(pos-4))))
                    return [null,null,2];
                else{
                    if(pos>2 && U == str.charAt(pos-1) && /^[CGLRT]$/.test(str.charAt(pos-3)))
                        return [F,F,2];
                    else{
                        if(pos && I != str.charAt(pos-1))
                            return [K,K,2];
                        else
                            return [null,null,2];
                    }
                }
            }
            else if(N==str.charAt(pos+1)){
                if(1==pos && str.charAt(0).vowel() && !str.slavo_germanic())
                    return ['KN',N,2];
                else{
                    if('EY' != str.substr(pos+2,2) && Y != str.charAt(pos+1) && !str.slavo_germanic())
                        return [N,'KN',2];
                    else
                        return ['KN',N,2];
                }
            }
            else if('LI'==str.substr(pos+1,2))
                return ['KL',L,2];
            else if(!pos && (Y==str.charAt(pos+1) || /^(E(S|P|B|L|Y|I|R)|I(B|L|N|E))$/.test(str.substr(pos+1,2))))
                return [K,J,2];
            else if(('ER' == str.substr(pos+1,2) || Y == str.charAt(pos+1)) && !/^(D|R|M)ANGER$/.test(str.substr(0,6)) && !/^[EI]$/.test(str.charAt(pos-1)) && !/^(R|O)GY$/.test(str.substr(pos-1,3)))
                return [K,J,2];
            else if(/^[EIY]$/.test(str.charAt(pos+1)) || /^(A|O)GGI$/.test(str.substr(pos-1,4))){
                if(/^V(A|O)N $/.test(str.substr(0,4)) || 'SCH' == str.substr(0,3) || 'ET' == str.substr(pos+1,2))
                    return [K,K,2];
                else{
                    if ('IER ' == str.substr(pos+1,4))
                        return [J,J,2];
                    else
                        return [J,K,2];
                }
            }
            else if(G==str.charAt(pos+1))
                return [K,K,2];
            else
                return [K,K,1];
        case cl==H:
            // H is kept only before a vowel, and then only at the start of the
            // word or after another vowel
            if((!pos || str.charAt(pos-1).vowel()) && str.charAt(pos+1).vowel())
                return [H,H,2];
            else
                return [null,null,1];
        case cl==J:
            if ('OSE' == str.substr(pos+1,3) || 'SAN ' == str.substr(0,4)){
                if((!pos && ' ' == str.charAt(pos+4)) || 'SAN ' == str.substr(0,4))
                    return [H,H,1];
                else
                    return [J,H,1];
            }
            else{
                current = (J==str.charAt(pos+1))? 2 : 1;
                if(!pos && 'OSE' != str.substr(pos+1,3))
                    return [J,A,current];
                else if (str.charAt(pos-1).vowel() && !str.slavo_germanic() && /^[AO]$/.test(str.charAt(pos+1)))
                    return [J,H,current];
                else if(last == pos)
                    return [J,null, current];
                else if (!/^[LTKSNMBZ]$/.test(str.charAt(pos+1)) && !/^[SKL]$/.test(str.charAt(pos-1)))
                    return [J,J,current];
                else
                    return [null,null,current];
            }
        case cl==K:
            return [K,K,(K==str.charAt(pos+1))? 2 : 1];
        case cl==L:
            if(L==str.charAt(pos+1)){
                // "ALLE" must accompany either ending test (...AS/...OS or ...A/...O)
                if(((length-3)==pos && /^(ILL(O|A)|ALLE)$/.test(str.substr(pos-1,4))) || ((/^(A|O)S$/.test(str.substr(last-1,2)) || /^[AO]$/.test(str.charAt(last))) && 'ALLE'== str.substr(pos-1,4)))
                    return [L,null,2];
                else
                    return [L,L,2];
            }
            else
                return [L,L,1];
        case cl==M:
            if(('UMB' == str.substr(pos-1,3) && (last-1 == pos || 'ER' == str.substr(pos+2,2))) || M == str.charAt(pos+1))
                return [M,M,2];
            else
                return [M,M,1];
        case cl==N:
            return [N,N,(N==str.charAt(pos+1))? 2: 1];
        case cl=='Ñ':
            return [N,N,1];
        case cl==P:
            if(H==str.charAt(pos+1))
                return [F,F,2];
            else
                return [P,P,(/^[PB]$/.test(str.charAt(pos+1)))? 2 : 1];
        case cl==Q:
            // look at the letter after the current one, not at index 1
            return [K,K,(Q==str.charAt(pos+1))? 2: 1];
        case cl==R:
            current =(R==str.charAt(pos+1))? 2 : 1;
            if(last == pos && !str.slavo_germanic() && 'IE' == str.substr(pos-2,2) && !/^M(E|A)$/.test(str.substr(pos-4,2)))
                return [null,R,current];
            else
                return [R,R,current];
        case cl==S:
            if(/^(I|Y)SL$/.test(str.substr(pos-1,3)))
                return [null,null,1];
            else if (H==str.charAt(pos+1)){
                if (/^H(EIM|OEK|OLM|OLZ)$/.test(str.substr(pos+1,4)))
                    return [S,S,2];
                else
                    return [X,X,2];
            }
            else if (/^I(O|A)$/.test(str.substr(pos+1,2)))
                return [S,(str.slavo_germanic())? S : X, 3];
            else if ((!pos && /^[MNLW]$/.test(str.charAt(pos+1))) || Z==str.charAt(pos+1))
                return [S,X,(Z==str.charAt(pos+1))? 2 : 1];
            else if (C== str.charAt(pos+1)){
                if (H== str.charAt(pos+2)){
                    if (/^(OO|ER|EN|UY|ED|EM)$/.test(str.substr(pos+3,2)))
                        return [(/^E(R|N)$/.test(str.substr(pos+3,2)))? X : 'SK','SK',3];
                    else
                        return [X,((!pos && !str.charAt(3).vowel()) && (W != str.charAt(pos+3)))? S : X,3];
                }
                else if (/^[IEY]$/.test(str.charAt(pos+2)))
                    return [S,S,3];
                else
                    return ['SK','SK',3];
            }
            else
                return [(last == pos && /^(A|O)I$/.test(str.substr(pos-2,2)))? null : S,S,(/^[SZ]$/.test(str.charAt(pos+1)))? 2 : 1];
        case cl==T:
            if ('ION' == str.substr(pos+1,3) || /^(IA|CH)$/.test(str.substr(pos+1,2)))
                return [X,X,3];
            else if(H==str.charAt(pos+1) || 'TH' == str.substr(pos+1,2)){
                if(/^(O|A)M$/.test(str.substr(pos+2,2)) || /^V(A|O)N $/.test(str.substr(0,4)) || 'SCH'== str.substr(0,3))
                    return [T,T,2];
                else
                    return['0',T,2]; // '0' is the code emitted for the "TH" sound
            }
            else
                return [T,T,(/^[TD]$/.test(str.charAt(pos+1)))? 2 : 1];
        case cl==V:
            return [F,F,(V==str.charAt(pos+1))? 2 : 1];
        case cl==W:
            if(R==str.charAt(pos+1))
                return [R,R,2];
            var pri = '';
            var sec = '';
            // only an initial W followed by a vowel or H is voiced
            if(!pos && (str.charAt(pos+1).vowel() || H==str.charAt(pos+1))){
                pri = A;
                sec = (str.charAt(pos+1).vowel())? F : A;
            }
            if((last == pos && str.charAt(pos-1).vowel()) || 'SCH' == str.substr(0,3) || /^(EWSKI|EWSKY|OWSKI|OWSKY)$/.test(str.substr(pos-1,5)))
                return [pri,sec+F,1];
            else if(/^I(C|T)Z$/.test(str.substr(pos+1,3)))
                return [pri+'TS',sec+'FX',4];
            else
                return [pri,sec,1];
        case cl==X:
            current = (/^[CX]$/.test(str.charAt(pos+1)))? 2 : 1;
            if (last == pos && (/^(I|E)AU$/.test(str.substr(pos-3,3)) || /^(A|O)U$/.test(str.substr(pos-2, 2))))
                return [null,null, current];
            else
                return ['KS','KS',current];
        case cl==Z:
            if(H==str.charAt(pos+1))
                return [J,J,2];
            else{
                current = (Z==str.charAt(pos+1)) ? 2 : 1;
                if(/^Z(O|I|A)$/.test(str.substr(pos+1,2)) || (str.slavo_germanic() && (pos > 0 && T != str.charAt(pos-1))))
                    return [S,'TS',current];
                else
                    return [S,S,current];
            }
    }
    // any other character (including the padding space) is skipped
    return [null,null,1];
}
283
/**
 * Tells whether `p_val` loosely equals (==) any element of `arr`.
 *
 * @param {Array} arr   candidates to compare against
 * @param {*}     p_val value to look for
 * @returns {boolean} true on the first match, false when nothing matches
 */
$.in_array = function(arr,p_val){
    var remaining = arr.length;
    // Walk from the back; the order does not affect a pure membership test.
    while (remaining-- > 0){
        if (arr[remaining] == p_val)
            return true;
    }
    return false;
};
290
+ })();