thomaspeklak-OfflineSearch 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,33 @@
1
+ # extends the string class to convert html entities
2
+ # use carefully: entities cannot be converted back, because some entities are simply skipped, as they are not useful for the search generation process
3
+ #
4
+ # * $Author$
5
+ # * $Rev$
6
+ # * $LastChangedDate$
7
+
8
class String
  # Converts the HTML entities for the German umlauts and the sharp s to
  # their UTF-8 characters. Every other named entity with four to six
  # letters (for example &nbsp;) is replaced by a single space, because
  # unknown entities are of no use for the semantic search. The conversion
  # is therefore lossy and cannot be reversed with encode_html_entities.
  #
  # Returns a new string; the receiver is not modified.
  def decode_html_entities
    mgsub([
      [/&auml;/, 'ä'], [/&Auml;/, 'Ä'],
      [/&ouml;/, 'ö'], [/&Ouml;/, 'Ö'],
      [/&uuml;/, 'ü'], [/&Uuml;/, 'Ü'],
      [/&szlig;/, 'ß'],
      # Must stay last: mgsub picks the first pair that matches, so the
      # specific entities above win over this catch-all.
      [/&[a-zA-Z]{4,6};/, ' ']
    ])
  end

  # Encodes the German umlauts and the sharp s as HTML entities.
  #
  # Returns a new string; the receiver is not modified.
  def encode_html_entities
    mgsub([
      [/ä/, '&auml;'], [/Ä/, '&Auml;'],
      [/ö/, '&ouml;'], [/Ö/, '&Ouml;'],
      # The pattern for the uppercase u-umlaut must be /Ü/; matching a plain
      # "U" would rewrite every capital U in the text.
      [/ü/, '&uuml;'], [/Ü/, '&Uuml;'],
      [/ß/, '&szlig;']
    ])
  end

  # Converts uppercase umlauts to their lowercase form; all other
  # characters are left unchanged.
  def umlaut_to_downcase
    mgsub([[/Ä/, 'ä'], [/Ö/, 'ö'], [/Ü/, 'ü']])
  end

  private

  # Substitutes several patterns in a single pass. [Author: Ruby Cookbook]
  #
  # key_value_pairs - Array of [Regexp, String] pairs. For each match the
  #                   replacement of the first pair whose pattern matches
  #                   is used.
  #
  # Returns a new string with all substitutions applied.
  def mgsub(key_value_pairs = [].freeze)
    patterns = key_value_pairs.map { |pattern, _replacement| pattern }
    gsub(Regexp.union(*patterns)) do |match|
      key_value_pairs.find { |pattern, _replacement| pattern =~ match }[1]
    end
  end
end
@@ -0,0 +1,9 @@
1
+ # generates a default configuration file in the current path
2
+ #
3
+ # * $Author$
4
+ # * $Rev$
5
+ # * $LastChangedDate$
6
+ #
7
require 'fileutils'
include FileUtils

# Copy the default configuration file that sits next to this script into
# the current working directory.
default_config = "#{File.dirname(__FILE__)}/config_default.yaml"
cp(default_config, './')
@@ -0,0 +1,18 @@
1
+ # generates the default stopword list in the current directory
2
+ # the language is taken from the language switch
3
+ #
4
+ # * $Author$
5
+ # * $Rev$
6
+ # * $LastChangedDate$
7
+ #
8
require 'fileutils'
include FileUtils

# Only these languages have a stopword list shipped next to this script.
supported_languages = ['german', 'english']
language_known = defined?($config) && supported_languages.include?($config['language'])

unless language_known
  $logger.error('language must be english or german')
  exit
end

# Copy the stopword list of the configured language into the current
# working directory.
stopword_source = "#{File.dirname(__FILE__)}/stoplist/#{$config['language']}/stopwords.txt"
cp(stopword_source, './')
@@ -0,0 +1,38 @@
1
+ # generates template files
2
+ # currently only one template is supported
3
+ #
4
+ # * $Author$
5
+ # * $Rev$
6
+ # * $LastChangedDate$
7
+ #
8
+
9
# Copies every file of a named template into the current working directory.
# All the work happens in the constructor.
class TemplateGenerator
  # template - name of the template directory below ../templates/
  #            (relative to this file).
  def initialize(template)
    @template = template
    find_files
    copy_files_to_current_path
  end

  private

  # Walks the template directory recursively and remembers every regular
  # file found in it.
  #
  # Returns the array of file paths (also stored in @files).
  def find_files
    require 'find'
    template_root = File.dirname(__FILE__) + '/../templates/' + @template
    @files = Find.find(template_root).select { |path| FileTest.file?(path) }
  end

  # Copies the collected files into the current path. Files from nested
  # directories all end up directly in './'.
  def copy_files_to_current_path
    require 'fileutils'
    @files.each { |path| FileUtils.cp(path, './') }
  end
end
@@ -0,0 +1,16 @@
1
+ # configures the ruby logger
2
+ #
3
+ # * $Author$
4
+ # * $Rev$
5
+ # * $LastChangedDate$
6
+ #
7
+
8
require 'logger'

# Sets up the global $logger from the optional 'logger' section of $config.
#
# Without a configuration (or without a 'logger' section) the logger writes
# to STDOUT at INFO level. Otherwise 'file' names the log target (the literal
# string 'STDOUT' selects standard output) and 'level' names the severity
# (case-insensitive, INFO when omitted).
logger_settings = ($config || {})['logger']

if logger_settings.nil?
  $logger = Logger.new(STDOUT)
  $logger.level = Logger::INFO
else
  log_target = logger_settings['file']
  $logger = Logger.new(log_target == 'STDOUT' ? STDOUT : log_target)

  # Resolve the level by constant lookup instead of eval so that a config
  # value can never be executed as code. The lookup is limited to the
  # constants of Logger::Severity; an unknown name raises NameError.
  level_name = (logger_settings['level'] || 'info').to_s.upcase
  $logger.level = Logger::Severity.const_get(level_name, false)
end
16
+
@@ -0,0 +1,11 @@
1
+ # Entry point for OfflineSearch (OS)
2
+ #
3
+ # * $Author$
4
+ # * $Rev$
5
+ # * $LastChangedDate$
6
+ #
7
+
8
# Load the command line handling first, then the controller.
require 'option_parser'
require 'action_controller'

# Constructing the controller is the only statement of this script.
ActionController.new
@@ -0,0 +1,61 @@
1
+ #parses command line options and merges them into the config file
2
+ #
3
+ # * $Author$
4
+ # * $Rev$
5
+ # * $LastChangedDate$
6
+ #
7
# The library name is lowercase; "YAML" only loads on case-insensitive
# file systems.
require 'yaml'
require 'optparse'

$config = {}

# Values given on the command line as [section, key, value] triples
# (section is nil for top-level keys). They are applied after parsing so
# that they overwrite the config file no matter where -c appears on the
# command line, and so that they work when the section does not exist yet.
cli_overrides = []

option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: OfflineSearch [options]"
  opts.on('-c', '--config=CONFIG_FILE', String, 'configuration file for the offline search') do |c|
    unless File.exist?(c)
      # The logger may not be configured yet at this point.
      $logger ? $logger.error('config file not found') : warn('config file not found')
      exit
    end
    # An empty YAML file loads as false/nil; fall back to an empty config.
    $config = YAML.load_file(c) || {}
  end
  opts.separator ""
  opts.separator "Generators"
  opts.on('-g', '--generate-default-config', 'creates a default config file in the current directory') do
    $action = 'generate_default_config'
  end
  opts.on('-w', '--generate-default-stopwords', 'creates a default stopword list in the current directory. Language flag is required.') do
    $action = 'generate_default_stopwords'
  end
  opts.on('-t', '--generate-template=TEMPLATE', 'creates search template files in the current directory. Possible values: base, base+double_metaphone') do |t|
    $action = 'generate_template'
    cli_overrides << [nil, 'template', t]
  end
  opts.on('-o', '--generate-search-data', 'crawler the documents in the given docpath and generates the search data file') do
    $action = 'generate_search'
  end
  opts.separator ""
  opts.separator "Optional arguments"
  opts.separator "can also be specified in the config file"
  opts.separator "command line arguments will overwrite any given value in the config file"
  opts.on('-d', '--docpath=DOCPATH', String, 'path of the documents') do |d|
    cli_overrides << ['crawler', 'docpath', d]
  end
  opts.on('-f', '--search-data-file=SEARCH_DATA_FILE', String, 'path and name of the search data file') do |f|
    cli_overrides << ['search_generator', 'search_data_file', f]
  end
  opts.on('-s', '--stopword-list=STOPWORD_LIST', String, 'stopword list, if none is specified the default stop word list is used') do |s|
    cli_overrides << ['crawler', 'stopwords', s]
  end
  opts.on('-l', '--language=LANGUAGE', String, 'required if you want to generate a default stopword list') do |l|
    cli_overrides << [nil, 'language', l]
  end
  opts.separator ""
  opts.on_tail('-h', '--help', 'Show this message') do
    puts opts
    exit
  end
  # Without any argument there is nothing to do: show the usage and stop.
  if opts.default_argv.empty?
    puts opts
    exit
  end
end

option_parser.parse!

# Merge the command line values into the (possibly file-loaded) config.
cli_overrides.each do |section, key, value|
  target = section ? ($config[section] ||= {}) : $config
  target[key] = value
end
@@ -0,0 +1,53 @@
1
+ # checks if all required arguments are specified and if their values are correct
2
+ #
3
+ # * $Author$
4
+ # * $Rev$
5
+ # * $LastChangedDate$
6
+ #
7
# Checks the global configuration ($config) for required values.
#
# All checks run in the constructor. An invalid language, storage, doc type
# list or stopword file is logged through $logger and terminates the process
# via exit. A missing docpath or a missing target directory for the output
# files is only logged.
class OptionValidator
  STORAGE_BACKENDS = %w[memory sqlite].freeze
  LANGUAGES = %w[german english].freeze

  def initialize
    # Missing sections are treated as empty so that the checks below report
    # a validation error instead of failing on nil.
    crawler = ($config['crawler'] ||= {})
    search_generator = $config['search_generator'] || {}

    unless LANGUAGES.include?($config['language'])
      log_error_and_exit('language must be english or german')
    end

    # The default stopword list depends on the language, so it is derived
    # only after the language has been validated.
    if crawler['stopwords'].nil?
      crawler['stopwords'] = File.dirname(__FILE__) + "/stoplist/#{$config['language']}/stopwords.txt"
    end

    unless STORAGE_BACKENDS.include?($config['storage'])
      log_error_and_exit('storage must be memory or sqlite')
    end

    docs = crawler['docs']
    log_error_and_exit('doc types must be specified') if docs.nil? || docs.size.zero?

    # File.exist? instead of File.exists?, which was removed in Ruby 3.2.
    unless File.exist?(crawler['stopwords'])
      log_error_and_exit('stopwords file does not exist')
    end

    $logger.error('docpath does not exist') unless directory_exists?(crawler['docpath'])

    unless base_directory_exists?(search_generator['search_data_file'])
      $logger.error('path to the search data file does not exits. Please create the directory first')
    end

    # The frequency file is optional; only check its directory when set.
    frequency_file = search_generator['output_frequency_to']
    if frequency_file && !base_directory_exists?(frequency_file)
      $logger.error('path to the frequency file does not exits. Please create the directory first')
    end
  end

  private

  # Logs message as an error and terminates the process.
  def log_error_and_exit(message)
    $logger.error(message)
    exit
  end

  # Returns true when the directory that would contain file exists.
  # A nil file counts as missing.
  def base_directory_exists?(file)
    !file.nil? && FileTest.directory?(File.dirname(file))
  end

  # Returns true when dir is an existing directory. A nil dir counts as
  # missing.
  def directory_exists?(dir)
    !dir.nil? && FileTest.directory?(dir)
  end
end
@@ -0,0 +1,112 @@
1
+ # generates the data for the search
2
+ #
3
+ # * $Author$
4
+ # * $Rev$
5
+ # * $LastChangedDate$
6
+ #
7
+
8
# Writes the JavaScript data file used by the offline search.
#
# The target file, the optional relative path, the optional frequency file
# and the double metaphone switch are read from $config['search_generator'].
#
# NOTE(review): terms, titles and file names are interpolated into the
# JavaScript literals without escaping; this assumes the caller only passes
# values free of quotes and backslashes - confirm against the crawler.
class SearchGenerator
  # files - Hash whose values respond to ID, title, name and page_rank.
  # terms - Hash of term => list of references; each reference responds to
  #         document (which responds to ID) and rank.
  #
  # Opens the search data file for writing; #generate closes it again.
  def initialize(files, terms)
    @files = files
    @terms = terms
    $logger.info("writing data to #{$config['search_generator']['search_data_file']}")
    @search_data_file = File.new($config['search_generator']['search_data_file'], 'w')
  end

  # Generates the search data and closes the data file, even when one of
  # the generation steps raises.
  #
  # Returns nil.
  def generate
    if double_metaphone?
      generate_terms_for_dm
    else
      generate_terms
    end
    generate_files
    generate_relative_path
    generate_frequency_file if $config['search_generator']['output_frequency_to']
    generate_double_metaphone if double_metaphone?
    nil
  ensure
    cleanup
  end

  private

  # True only when use_double_metaphone is set to exactly true.
  def double_metaphone?
    $config['search_generator']['use_double_metaphone'] == true
  end

  # Sums the ranks per document and renders them as a comma separated list
  # of 'id-rank' strings, highest rank first.
  #
  # Because of a JavaScript performance issue with nested arrays, the page
  # id and the page rank are put into one string that the JavaScript search
  # splits on demand.
  def ranked_documents(references)
    ranks = Hash.new(0)
    references.each { |reference| ranks[reference.document.ID] += reference.rank }
    ranks.sort { |a, b| a[1] <=> b[1] }.reverse.map { |doc_id, rank| "'#{doc_id}-#{rank}'" }.join(',')
  end

  # Writes "var terms = {...};", a JavaScript hash of
  # term => ['document id-rank', ...]. An empty term list yields "{}".
  def generate_terms
    $logger.info("generating term base")
    entries = @terms.map { |term, references| "'#{term}':[#{ranked_documents(references)}]" }
    @search_data_file.puts "var terms = {#{entries.join(',')}};"
  end

  # Double metaphone variant of generate_terms: writes
  # "var terms = {term: index, ...};" and a parallel
  # "var ranks = [[...], ...];" array holding the ranked documents of the
  # term at the same index.
  def generate_terms_for_dm
    $logger.info("generating term base")
    term_entries = []
    rank_entries = []
    @terms.each_with_index do |(term, references), index|
      term_entries << "'#{term}':#{index}"
      rank_entries << "[#{ranked_documents(references)}]"
    end
    @search_data_file.puts "var terms = {#{term_entries.join(',')}};"
    @search_data_file.puts "var ranks = [#{rank_entries.join(',')}];"
  end

  # Writes "var files = {...};", a JavaScript hash of
  # file id => [title, file name, page rank]. The first character of the
  # file name is dropped.
  def generate_files
    $logger.info("generating file base")
    entries = @files.values.map do |file|
      "#{file.ID}:[\"#{file.title}\",'#{file.name[1..-1]}',#{file.page_rank}]"
    end
    @search_data_file.puts "var files = {#{entries.join(',')}};"
  end

  # Stores the configured relative path (with exactly one trailing slash)
  # in the JavaScript variable rel_path; writes nothing when it is not set.
  def generate_relative_path
    $logger.info("generating relative path")
    relative_path = $config['search_generator']['relative_path_to_files']
    return if relative_path.nil?

    @search_data_file.puts "var rel_path = '#{relative_path.sub(%r{/\z}, '')}/';"
  end

  # Writes one "term reference_count" line per term to the frequency file.
  def generate_frequency_file
    $logger.info("generating frequency file")
    File.open($config['search_generator']['output_frequency_to'], 'w') do |f|
      @terms.each do |term, references|
        f.puts "#{term} #{references.size}"
      end
    end
  end

  # Writes "var dm_data = [...];" with the double metaphone codes of every
  # term, in term order. The secondary code is omitted when there is none.
  def generate_double_metaphone
    $logger.info("generating double metaphone data")
    # Loaded lazily because the gem is only needed for this optional step.
    # The library name is lowercase; 'Text' only loads on case-insensitive
    # file systems.
    require 'text'
    entries = @terms.map do |term, _references|
      primary, secondary = Text::Metaphone.double_metaphone(term)
      secondary ? "['#{primary}','#{secondary}']" : "['#{primary}']"
    end
    @search_data_file.puts "var dm_data = [#{entries.join(',')}];"
  end

  # Closes the search data file.
  def cleanup
    @search_data_file.close unless @search_data_file.closed?
  end
end
@@ -0,0 +1,7 @@
1
# Global stopword lookup table: every line of the configured stopword file
# becomes a key; the values are always nil.
$stop_words = {}

File.foreach($config['crawler']['stopwords']) do |line|
  $stop_words[line.chomp] = nil
end
@@ -0,0 +1,317 @@
1
+ a
2
+ abandon
3
+ abandoned
4
+ abc
5
+ able
6
+ about
7
+ above
8
+ absence
9
+ absent
10
+ absolute
11
+ absolutely
12
+ ac
13
+ accordance
14
+ according
15
+ accordingly
16
+ acct
17
+ across
18
+ active
19
+ actively
20
+ actual
21
+ actually
22
+ ad
23
+ adb
24
+ adds
25
+ adequate
26
+ adequately
27
+ adjacent
28
+ adversely
29
+ after
30
+ afterwards
31
+ again
32
+ against
33
+ ago
34
+ ahead
35
+ alike
36
+ all
37
+ almost
38
+ alone
39
+ along
40
+ already
41
+ also
42
+ although
43
+ always
44
+ am
45
+ among
46
+ amongst
47
+ an
48
+ analog
49
+ and
50
+ another
51
+ any
52
+ anybody
53
+ anymore
54
+ anyone
55
+ anything
56
+ anyway
57
+ anywhere
58
+ apart
59
+ approx
60
+ are
61
+ around
62
+ as
63
+ aside
64
+ at
65
+ auto
66
+ avail
67
+ available
68
+ aware
69
+ away
70
+ awhile
71
+ badly
72
+ be
73
+ became
74
+ because
75
+ become
76
+ becomes
77
+ becoming
78
+ been
79
+ before
80
+ being
81
+ below
82
+ beside
83
+ besides
84
+ better
85
+ between
86
+ beyond
87
+ brief
88
+ briefly
89
+ but
90
+ by
91
+ bye
92
+ can
93
+ come
94
+ comes
95
+ comfortable
96
+ common
97
+ commonly
98
+ completely
99
+ cons
100
+ continually
101
+ continue
102
+ continued
103
+ continues
104
+ continuing
105
+ continuous
106
+ continuously
107
+ could
108
+ couple
109
+ course
110
+ current
111
+ currently
112
+ definitely
113
+ despite
114
+ did
115
+ directly
116
+ do
117
+ does
118
+ doing
119
+ done
120
+ during
121
+ each
122
+ either
123
+ empty
124
+ enough
125
+ even
126
+ eventually
127
+ ever
128
+ every
129
+ everybody
130
+ everyone
131
+ everything
132
+ everywhere
133
+ ex
134
+ exact
135
+ exactly
136
+ for
137
+ from
138
+ generally
139
+ had
140
+ has
141
+ have
142
+ he
143
+ hello
144
+ her
145
+ hereby
146
+ herein
147
+ hi
148
+ him
149
+ himself
150
+ his
151
+ how
152
+ however
153
+ ideally
154
+ if
155
+ in
156
+ indeed
157
+ inner
158
+ into
159
+ is
160
+ it
161
+ its
162
+ itself
163
+ just
164
+ like
165
+ lot
166
+ lots
167
+ many
168
+ may
169
+ maybe
170
+ me
171
+ mean
172
+ mere
173
+ merely
174
+ might
175
+ more
176
+ most
177
+ mostly
178
+ must
179
+ my
180
+ myself
181
+ neither
182
+ not
183
+ obvious
184
+ obviously
185
+ of
186
+ off
187
+ on
188
+ once
189
+ one
190
+ ones
191
+ ongoing
192
+ only
193
+ onto
194
+ other
195
+ others
196
+ otherwise
197
+ ought
198
+ our
199
+ ours
200
+ out
201
+ outer
202
+ over
203
+ particular
204
+ particularly
205
+ please
206
+ previous
207
+ previously
208
+ ready
209
+ really
210
+ recent
211
+ recently
212
+ relative
213
+ relatively
214
+ same
215
+ see
216
+ seldom
217
+ self
218
+ serious
219
+ seriously
220
+ set
221
+ similar
222
+ since
223
+ sincerely
224
+ so
225
+ some
226
+ somebody
227
+ someday
228
+ somehow
229
+ someone
230
+ someplace
231
+ something
232
+ sometime
233
+ sometimes
234
+ somewhat
235
+ somewhere
236
+ sorely
237
+ sorry
238
+ successful
239
+ successfully
240
+ such
241
+ suddenly
242
+ suitable
243
+ sure
244
+ surely
245
+ than
246
+ thank
247
+ thanks
248
+ that
249
+ thats
250
+ the
251
+ their
252
+ them
253
+ theme
254
+ themselves
255
+ then
256
+ there
257
+ thereby
258
+ therefore
259
+ these
260
+ they
261
+ this
262
+ thorough
263
+ thoroughly
264
+ those
265
+ though
266
+ through
267
+ throughout
268
+ throughput
269
+ to
270
+ today
271
+ together
272
+ total
273
+ totally
274
+ toward
275
+ towards
276
+ typical
277
+ typically
278
+ until
279
+ up
280
+ upon
281
+ usually
282
+ various
283
+ very
284
+ want
285
+ was
286
+ we
287
+ well
288
+ went
289
+ were
290
+ what
291
+ whatever
292
+ whats
293
+ whatsoever
294
+ when
295
+ where
296
+ which
297
+ while
298
+ who
299
+ whoever
300
+ whole
301
+ whom
302
+ whose
303
+ why
304
+ will
305
+ with
306
+ within
307
+ without
308
+ worth
309
+ worthwhile
310
+ worthy
311
+ would
312
+ yes
313
+ yet
314
+ you
315
+ your
316
+ yours
317
+ yourself