gonzui 1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (116)
  1. data/AUTHORS.txt +9 -0
  2. data/History.txt +5539 -0
  3. data/Manifest.txt +115 -0
  4. data/PostInstall.txt +17 -0
  5. data/README.rdoc +149 -0
  6. data/Rakefile +28 -0
  7. data/bin/gonzui-db +167 -0
  8. data/bin/gonzui-import +177 -0
  9. data/bin/gonzui-remove +58 -0
  10. data/bin/gonzui-search +68 -0
  11. data/bin/gonzui-server +176 -0
  12. data/bin/gonzui-update +53 -0
  13. data/data/gonzui/catalog/catalog.ja +80 -0
  14. data/data/gonzui/doc/favicon.ico +0 -0
  15. data/data/gonzui/doc/folder.png +0 -0
  16. data/data/gonzui/doc/gonzui.css +279 -0
  17. data/data/gonzui/doc/gonzui.js +111 -0
  18. data/data/gonzui/doc/text.png +0 -0
  19. data/data/gonzuirc.sample +29 -0
  20. data/ext/autopack/autopack.c +88 -0
  21. data/ext/autopack/extconf.rb +3 -0
  22. data/ext/delta/delta.c +147 -0
  23. data/ext/delta/extconf.rb +5 -0
  24. data/ext/texttokenizer/extconf.rb +5 -0
  25. data/ext/texttokenizer/texttokenizer.c +93 -0
  26. data/ext/xmlformatter/extconf.rb +5 -0
  27. data/ext/xmlformatter/xmlformatter.c +207 -0
  28. data/lib/gonzui.rb +59 -0
  29. data/lib/gonzui/apt.rb +193 -0
  30. data/lib/gonzui/bdbdbm.rb +118 -0
  31. data/lib/gonzui/cmdapp.rb +14 -0
  32. data/lib/gonzui/cmdapp/app.rb +175 -0
  33. data/lib/gonzui/cmdapp/search.rb +134 -0
  34. data/lib/gonzui/config.rb +117 -0
  35. data/lib/gonzui/content.rb +19 -0
  36. data/lib/gonzui/dbm.rb +673 -0
  37. data/lib/gonzui/deindexer.rb +162 -0
  38. data/lib/gonzui/delta.rb +49 -0
  39. data/lib/gonzui/extractor.rb +347 -0
  40. data/lib/gonzui/fetcher.rb +309 -0
  41. data/lib/gonzui/gettext.rb +144 -0
  42. data/lib/gonzui/importer.rb +84 -0
  43. data/lib/gonzui/indexer.rb +316 -0
  44. data/lib/gonzui/info.rb +80 -0
  45. data/lib/gonzui/license.rb +100 -0
  46. data/lib/gonzui/logger.rb +48 -0
  47. data/lib/gonzui/monitor.rb +177 -0
  48. data/lib/gonzui/progressbar.rb +235 -0
  49. data/lib/gonzui/remover.rb +38 -0
  50. data/lib/gonzui/searcher.rb +330 -0
  51. data/lib/gonzui/searchquery.rb +235 -0
  52. data/lib/gonzui/searchresult.rb +111 -0
  53. data/lib/gonzui/updater.rb +254 -0
  54. data/lib/gonzui/util.rb +415 -0
  55. data/lib/gonzui/vcs.rb +128 -0
  56. data/lib/gonzui/webapp.rb +25 -0
  57. data/lib/gonzui/webapp/advsearch.rb +123 -0
  58. data/lib/gonzui/webapp/filehandler.rb +24 -0
  59. data/lib/gonzui/webapp/jsfeed.rb +61 -0
  60. data/lib/gonzui/webapp/markup.rb +445 -0
  61. data/lib/gonzui/webapp/search.rb +269 -0
  62. data/lib/gonzui/webapp/servlet.rb +319 -0
  63. data/lib/gonzui/webapp/snippet.rb +155 -0
  64. data/lib/gonzui/webapp/source.rb +37 -0
  65. data/lib/gonzui/webapp/stat.rb +137 -0
  66. data/lib/gonzui/webapp/top.rb +63 -0
  67. data/lib/gonzui/webapp/uri.rb +140 -0
  68. data/lib/gonzui/webapp/webrick.rb +48 -0
  69. data/script/console +10 -0
  70. data/script/destroy +14 -0
  71. data/script/generate +14 -0
  72. data/script/makemanifest.rb +21 -0
  73. data/tasks/extconf.rake +13 -0
  74. data/tasks/extconf/autopack.rake +43 -0
  75. data/tasks/extconf/delta.rake +43 -0
  76. data/tasks/extconf/texttokenizer.rake +43 -0
  77. data/tasks/extconf/xmlformatter.rake +43 -0
  78. data/test/_external_tools.rb +13 -0
  79. data/test/_test-util.rb +142 -0
  80. data/test/foo/Makefile.foo +66 -0
  81. data/test/foo/bar.c +5 -0
  82. data/test/foo/bar.h +6 -0
  83. data/test/foo/foo.c +25 -0
  84. data/test/foo/foo.spec +33 -0
  85. data/test/test_apt.rb +42 -0
  86. data/test/test_autopack_extn.rb +7 -0
  87. data/test/test_bdbdbm.rb +79 -0
  88. data/test/test_cmdapp-app.rb +35 -0
  89. data/test/test_cmdapp-search.rb +99 -0
  90. data/test/test_config.rb +28 -0
  91. data/test/test_content.rb +15 -0
  92. data/test/test_dbm.rb +171 -0
  93. data/test/test_deindexer.rb +50 -0
  94. data/test/test_delta.rb +66 -0
  95. data/test/test_extractor.rb +78 -0
  96. data/test/test_fetcher.rb +75 -0
  97. data/test/test_gettext.rb +50 -0
  98. data/test/test_gonzui.rb +11 -0
  99. data/test/test_helper.rb +10 -0
  100. data/test/test_importer.rb +56 -0
  101. data/test/test_indexer.rb +37 -0
  102. data/test/test_info.rb +82 -0
  103. data/test/test_license.rb +49 -0
  104. data/test/test_logger.rb +60 -0
  105. data/test/test_monitor.rb +23 -0
  106. data/test/test_searcher.rb +37 -0
  107. data/test/test_searchquery.rb +27 -0
  108. data/test/test_searchresult.rb +43 -0
  109. data/test/test_texttokenizer.rb +47 -0
  110. data/test/test_updater.rb +95 -0
  111. data/test/test_util.rb +149 -0
  112. data/test/test_vcs.rb +61 -0
  113. data/test/test_webapp-markup.rb +42 -0
  114. data/test/test_webapp-util.rb +19 -0
  115. data/test/test_webapp-xmlformatter.rb +19 -0
  116. metadata +291 -0
@@ -0,0 +1,84 @@
1
+ #
2
+ # importer.rb - import contents to gonzui.db
3
+ #
4
+ # Copyright (C) 2004-2005 Satoru Takabayashi <satoru@namazu.org>
5
+ # All rights reserved.
6
+ # This is free software with ABSOLUTELY NO WARRANTY.
7
+ #
8
+ # You can redistribute it and/or modify it under the terms of
9
+ # the GNU General Public License version 2.
10
+ #
11
+
12
+ require 'uri'
13
+
14
+ module Gonzui
15
# Raised by Importer when a package cannot be imported
# (for example, when the package already exists in the database).
class ImporterError < GonzuiError; end
16
+
17
# Imports the contents of one package, obtained through a Fetcher,
# into the gonzui database.
#
# @dbm, @config, @npackages and the helpers make_progress_bar,
# index_content and vprintf are not defined in this class; they are
# presumably provided by AbstractUpdater / Util -- confirm there.
class Importer < AbstractUpdater
  # config  - configuration object, passed through to AbstractUpdater
  # options - option hash, passed through to AbstractUpdater
  def initialize(config, options = {})
    super(config, options)
    # Set by import_package once a package has been imported.
    @last_package_name = nil
  end

  # Name of the most recently imported package, or nil if none yet.
  attr_reader :last_package_name

  private

  # Fetches and indexes every path that +fetcher+ reports.
  #
  # fetcher    - object answering package_name, collect and fetch(path)
  # source_uri - URI the package comes from; handed to index_content
  #
  # Raises ImporterError when the package is already in the database.
  # A path whose fetch raises is reported via vprintf and skipped; the
  # remaining paths are still processed.
  def import_package(fetcher, source_uri)
    package_name = fetcher.package_name
    if @dbm.has_package?(package_name)
      raise ImporterError.new("#{package_name}: already exists")
    end

    relative_paths = fetcher.collect
    pbar = make_progress_bar(package_name, relative_paths.length)
    begin
      relative_paths.each do |relative_path|
        begin
          normalized_path = File.join(package_name, relative_path)
          content = nil
          begin
            content = fetcher.fetch(relative_path)
          rescue => e
            # The format has three %s directives, so three arguments are
            # required; the third is the backtrace (empty when absent).
            vprintf("fetch failed: %s: %s\n%s", relative_path, e.message,
                    Array(e.backtrace).join("\n"))
            next
          end
          index_content(source_uri, normalized_path, content)
        ensure
          # Advance the bar for skipped and failed paths as well.
          pbar.inc
        end
      end
    ensure
      # Always flush, even when indexing aborts part-way through.
      @dbm.flush_cache
    end
    pbar.finish
    @npackages += 1
    @last_package_name = package_name
  end

  # Verb used for this task ("imported").
  def do_task_name
    "imported"
  end

  public

  # Imports the package located at +source_uri+.
  # The fetcher is always finished, even when the import raises.
  def import(source_uri)
    fetcher = Fetcher.new(@config, source_uri)
    begin
      import_package(fetcher, source_uri)
    ensure
      fetcher.finish
    end
  end

  # Returns the superclass summary; in verbose mode the indexer's
  # performance statistics are appended when they are not empty.
  def summary
    text = super
    if @config.verbose
      stat = Indexer.statistics
      text += "\n" + stat unless stat.empty?
    end
    return text
  end

  # Closes the database handle.
  def finish
    @dbm.close
  end
end
84
+ end
@@ -0,0 +1,316 @@
1
+ #
2
+ # indexer.rb - indexer implementation
3
+ #
4
+ # Copyright (C) 2004-2005 Satoru Takabayashi <satoru@namazu.org>
5
+ # All rights reserved.
6
+ # This is free software with ABSOLUTELY NO WARRANTY.
7
+ #
8
+ # You can redistribute it and/or modify it under the terms of
9
+ # the GNU General Public License version 2.
10
+ #
11
+
12
+ require 'ftools'
13
+ require 'digest/md5'
14
+ require 'langscan'
15
+
16
+ module Gonzui
17
# Raised by Indexer for indexing failures (for example, a normalized
# path that has no package component).
class IndexerError < GonzuiError; end
18
+
19
# Indexes a single piece of content (one file of one package) into the
# database handle given at construction time.
#
# Helpers such as vprintf, assert_equal, AutoPack, DeltaDumper,
# TextTokenizer, UTF8, LicenseDetector and String#untabify are defined
# outside this class (presumably in Util and the extension modules).
class Indexer
  include Util

  # Shared across all Indexer instances; only fed in verbose mode.
  @@performance_monitor = PerformanceMonitor.new

  # Returns a formatted performance report, or "" when nothing has been
  # profiled.
  def self.statistics
    return "" if @@performance_monitor.empty?
    pm = @@performance_monitor
    summary = "Performance statistics:\n"
    summary << pm.heading
    summary << pm.format([Indexer, :index],
                         [Indexer, :read_content],
                         [Indexer, :add_license],
                         [Indexer, :index_content])
    labels = LangScan.modules.map {|m| [m, :scan] }
    labels.push([Indexer, :add_fragment], [Indexer, :flush_cache])
    summary << pm.format([Indexer, :index_content], *labels)
    return summary
  end

  # config          - configuration (verbose, utf8, noindex_formats are read)
  # dbm             - database handle to write into
  # source_uri      - URI of the package source
  # normalized_path - "<package>/<relative path>" of the content
  # content         - object answering text, text=, length and mtime
  # options         - :noindex_formats overrides config.noindex_formats
  #
  # Raises IndexerError when normalized_path has no package component.
  def initialize(config, dbm, source_uri, normalized_path, content,
                 options = {})
    @config = config
    @dbm = dbm
    @normalized_path = normalized_path
    @source_uri = source_uri
    @content = content
    @content_hash = Digest::MD5.hexdigest(content.text)
    @noindex_formats = (options[:noindex_formats] || @config.noindex_formats)

    @package_name = get_package_name
    @seqno = 0

    @word_cache = {}
    @wordinfo_cache = {}
    @digest_cache = []

    # to be initialized
    @format_id = nil
    @license_id = nil
    @license_abbrev = nil
    @encoding = nil
    @nlines = nil
    @package_id = nil
    @path_id = nil
    @bols = [] # positions of beginning of lines
    @indexed_p = false

    initialize_profilers_if_necessary
  end

  # Registers the profiled Indexer methods with the shared monitor when
  # the configuration is verbose.
  def initialize_profilers_if_necessary
    return unless @config.verbose
    [:index, :read_content, :index_content,
     :add_fragment, :add_license, :flush_cache].each do |name|
      @@performance_monitor.profile(Indexer, name)
    end
  end

  # Normalizes the content text in place, then records the line count
  # and the start offset of every line.
  def read_content
    content, @encoding = normalize_content(@content.text)
    @content.text = content
    @nlines = 0
    pos = 0
    @content.text.each_line do |line|
      @bols.push(pos)
      @nlines += 1
      # NOTE(review): String#length counts characters on Ruby >= 1.9;
      # if these offsets must be byte offsets, bytesize may be needed --
      # confirm against the runtime this targets.
      pos += line.length
    end
  end

  # Control characters below 0x20 mark content as binary, except for the
  # allowed ones: 0x09 (TAB), 0x0a (LF), 0x0c (^L), 0x0d (CR), 0x1b (ESC).
  allowed = [0x09, 0x0a, 0x0c, 0x0d, 0x1b]
  escapes = (0...0x20).reject {|code| allowed.include?(code) }
  pattern = "[" + escapes.map {|code| sprintf("\\x%02x", code) }.join + "]"
  BinaryRegexp = Regexp.new(pattern)

  # Returns a MatchData (truthy) when +content+ contains a disallowed
  # control character, nil otherwise.
  def binary_content?(content)
    BinaryRegexp.match(content)
  end

  # Returns [content, encoding]. Binary content is returned untouched
  # with encoding "binary"; otherwise the content is passed through
  # UTF8.to_utf8 when config.utf8 is set, else reported as "ascii".
  def convert_to_utf8(content)
    return [content, "binary"] if binary_content?(content)
    encoding = "ascii"
    content, encoding = UTF8.to_utf8(content) if @config.utf8
    return content, encoding
  end

  # Returns [content, encoding]. For non-binary content, tabs are
  # expanded via untabify and CR / CRLF line endings become LF.
  def normalize_content(content)
    content, encoding = convert_to_utf8(content)
    unless encoding == "binary"
      content = content.untabify
      content.gsub!(/\r\n?/, "\n")
    end
    return content, encoding
  end

  # Derives the package name from the first component of the normalized
  # path, falling back to the basename of the source URI path when that
  # component is empty, "." or "..".
  #
  # Raises IndexerError when the path has fewer than two components.
  def get_package_name
    parts = @normalized_path.split("/")
    if parts.length < 2
      raise IndexerError.new("normalized path should not be flat")
    end
    package_name = parts.first
    if package_name.size == 0 || package_name == "." || package_name == ".."
      package_name = File.basename(@source_uri.path)
    end
    return package_name
  end

  # Splits the fragment text into words and records each one at its
  # offset relative to the start of the content.
  def add_text(fragment, type_id)
    text = fragment.text
    byteno = fragment.byteno
    TextTokenizer.each_word(text) do |word, pos|
      add_word(word, byteno + pos, type_id)
    end
  end

  # Records one scanner fragment: its words go to the word cache, and
  # its (byteno, length, type id) triple to the digest cache.
  def add_fragment(fragment)
    type_id = @dbm.get_type_id(fragment.type)
    if LangScan::Type.splittable?(fragment.type)
      add_text(fragment, type_id)
    else
      add_word(fragment.text, fragment.byteno, type_id)
    end

    @digest_cache.push(fragment.byteno, fragment.text.length, type_id)
  end

  # Writes the cached word, digest and line-offset data for the current
  # path to the database, then clears the word-info cache.
  def flush_cache
    all_word_ids = @wordinfo_cache.keys.sort!
    all_word_ids.each do |word_id|
      path_word_id = AutoPack.pack_id2(@path_id, word_id)
      @dbm.pathwordid_info[path_word_id] =
        DeltaDumper.dump_tuples(WordInfo, @wordinfo_cache[word_id])
    end
    @dbm.put_pathid_wordids(@package_id, @path_id, all_word_ids)
    @dbm.pathid_wordids[@path_id] = DeltaDumper.dump_ids(all_word_ids)
    @dbm.pathid_digest[@path_id] =
      DeltaDumper.dump_tuples(DigestInfo, @digest_cache)
    @dbm.pathid_bols[@path_id] = DeltaDumper.dump_fixnums(@bols)
    @wordinfo_cache.clear
    @dbm.word_id_counter.flush
  end

  # Looks up (or creates) the id for a named property, associates it
  # with the current package and bumps the per-property content counter.
  #
  # counter, make_key and pkgid_ids are names of methods on @dbm.
  # Returns the property id.
  def add_property(abbrev, name, counter, make_key, pkgid_ids)
    id = @dbm.send(counter).get_id2(abbrev, name)
    @dbm.send(pkgid_ids)[@package_id] = id
    @dbm.increase_counter(@dbm.send(make_key, abbrev))
    return id
  end

  # Registers the content format and remembers its id.
  def add_format(format_abbrev, format_name)
    @format_id = add_property(format_abbrev,
                              format_name,
                              :format_id_counter,
                              :make_ncontents_by_format_key,
                              :pkgid_fmtids)
  end

  # Detects the license of the content text and registers it.
  def add_license
    license = LicenseDetector.new(@content.text).detect
    @license_id = add_property(license.abbrev,
                               license.name,
                               :license_id_counter,
                               :make_ncontents_by_license_key,
                               :pkgid_lcsids)
    @license_abbrev = license.abbrev
  end

  # Allocates a new path id and records the path <-> id mappings.
  # The path must not already be present in the database.
  def add_path
    assert_equal(false, @dbm.path_pathid.include?(@normalized_path))
    @path_id = @dbm.path_id_counter.make_new_id
    @dbm.path_pathid[@normalized_path] = @path_id
    @dbm.pathid_path[@path_id] = @normalized_path
    @dbm.pkgid_pathids[@package_id] = @path_id
  end

  # Runs +scanner+ over the content and returns the fragments whose type
  # is known to LangScan::Type, sorted by byteno.
  def get_fragments(scanner)
    @@performance_monitor.profile(scanner, :scan) if @config.verbose
    fragments = []
    scanner.scan(@content.text) do |fragment|
      fragments.push(fragment) if LangScan::Type.include?(fragment.type)
    end
    return fragments.sort_by {|fragment| fragment.byteno }
  end

  # Appends a (seqno, byteno, type id) triple for +word+ to the
  # word-info cache and advances the sequence number.
  def add_word(word, byteno, type_id)
    word_id = @dbm.word_id_counter.get_id(word)
    array = (@wordinfo_cache[word_id] ||= [])
    array.push(@seqno, byteno, type_id)
    @seqno += 1
  end

  # Reuses the existing package id or registers a new package.
  def add_package_if_necessary
    if @dbm.has_package?(@package_name)
      @package_id = @dbm.get_package_id(@package_name)
    else
      @package_id = @dbm.package_id_counter.make_new_id
      @dbm.pkg_pkgid[@package_name] = @package_id
      @dbm.pkgid_pkg[@package_id] = @package_name
      @dbm.pkgid_src[@package_id] = @source_uri.to_s
      @dbm.put_package_options(@package_id)
    end
  end

  # Returns the packed ContentInfo record for the current content; the
  # index time is the current time.
  def make_content_info
    ContentInfo.dump(@content.length, @content.mtime.to_i,
                     Time.now.to_i, @format_id, @license_id,
                     @nlines, @indexed_p)
  end

  # Scans the content with +scanner+ and stores the resulting index.
  #
  # If scanning raises, it is retried once with LangScan::Text. If the
  # text scanner itself raises, the error is ignored and the content is
  # indexed with no fragments.
  def index_content(scanner)
    fragments = []
    begin
      fragments = get_fragments(scanner)
    rescue
      # fallback to the text scanner
      unless scanner == LangScan::Text
        # Pass the path as an argument: interpolating it into the format
        # string would break on file names containing "%".
        vprintf("%s: fallback to LangScan::Text", @normalized_path)
        scanner = LangScan::Text
        retry
      end
    end
    fragments.each {|fragment| add_fragment(fragment) }
    flush_cache
    @dbm.increase_counter(:ncontents_indexed)
    @dbm.increase_counter(:nlines_indexed, @nlines)
    @indexed_p = true
  end

  # Stores everything that is recorded for indexed and non-indexed
  # content alike: format, license, text, info record and hash.
  def add_content_common(format_abbrev, format_name)
    add_format(format_abbrev, format_name)
    add_license
    @dbm.pathid_pkgid[@path_id] = @package_id
    @dbm.pathid_content[@path_id] = @content.text
    @dbm.pathid_info[@path_id] = make_content_info
    @dbm.pathid_hash[@path_id] = @content_hash
    vprintf("added (%s): %s (%s)", format_abbrev,
            @normalized_path, @license_abbrev)
  end

  # Stores binary content without indexing it.
  def add_binary_content
    add_content_common("binary", "Binary")
  end

  # Picks a scanner for the content, defaulting to LangScan::Text.
  def make_scanner
    LangScan.choose(@normalized_path, @content.text) || LangScan::Text
  end

  # True unless the scanner's format is listed in the no-index formats.
  def indexable?(scanner)
    not @noindex_formats.include?(scanner.abbrev)
  end

  # Indexes the content when its format is indexable, then stores it.
  def add_content_with_indexing
    scanner = make_scanner
    if indexable?(scanner)
      index_content(scanner)
    else
      vprintf("skip indexing: %s", @normalized_path)
    end
    add_content_common(scanner.abbrev, scanner.name)
  end

  # Stores the content, indexing it unless it was detected as binary.
  def add_content
    if @encoding == "binary"
      add_binary_content
    else
      add_content_with_indexing
    end
  end

  public

  # Entry point: reads the content, registers package and path, then
  # stores (and, where applicable, indexes) the content.
  def index
    read_content
    add_package_if_necessary
    add_path
    add_content
  end
end
316
+ end
@@ -0,0 +1,80 @@
1
+ #
2
+ # info.rb - information classes
3
+ #
4
+ # Copyright (C) 2004-2005 Satoru Takabayashi <satoru@namazu.org>
5
+ # All rights reserved.
6
+ # This is free software with ABSOLUTELY NO WARRANTY.
7
+ #
8
+ # You can redistribute it and/or modify it under the terms of
9
+ # the GNU General Public License version 2.
10
+ #
11
+
12
+ module Gonzui
13
# Span helpers for objects that answer +byteno+ (start offset) and
# +length+.
module BytenoMixin
  # Offset just past the end of the span (byteno + length).
  def end_byteno
    byteno + length
  end

  # The span as an end-exclusive Range: byteno ... end_byteno.
  def range
    byteno...end_byteno
  end
end
22
+
23
# One occurrence of a word inside a path.
WordInfo = Struct.new(:word_id, :path_id,
                      :seqno, :byteno, :type_id, :type, :lineno)
class WordInfo
  # NOTE(review): this struct has no :length member, so the +length+
  # used by BytenoMixin resolves to Struct#length (the member count)
  # unless it is overridden elsewhere -- confirm that is intended.
  include BytenoMixin

  # dump info
  # NOTE(review): presumably consumed by DeltaDumper.dump_tuples;
  # verify the meaning of each constant there.
  DeltaSize = 2
  UnitSize = 3

  # True when +target_type+ is :all or equals this word's type.
  def match?(target_type)
    target_type == :all || target_type == type
  end
end
36
+
37
# A typed span of content: start offset, length and fragment type.
DigestInfo = Struct.new(:byteno, :length, :type_id, :type)
class DigestInfo
  include BytenoMixin

  # dump info
  # NOTE(review): presumably consumed by DeltaDumper.dump_tuples;
  # verify the meaning of each constant there.
  DeltaSize = 1
  UnitSize = 3
end
45
+
46
# Per-content metadata record with a compact binary serialization.
ContentInfo = Struct.new(:size, :mtime, :itime,
                         :format_id, :license_id,
                         :nlines, :indexed_p)
class ContentInfo
  extend Util

  # Every field is packed as a BER-compressed integer.
  PACK_FORMAT = "w*"

  # Rebuilds a ContentInfo from a string produced by ContentInfo.dump.
  # The stored 1/0 flag is converted back to true/false.
  def self.load(dump)
    info = new(*dump.unpack(PACK_FORMAT))
    info.indexed_p = (info.indexed_p == 1)
    return info
  end

  # Packs the given fields into a binary string.
  #
  # indexed_p is stored as 1 (truthy) or 0. A negative mtime is reported
  # via vprintf and replaced with the current time.
  def self.dump(size, mtime, itime, format_id,
                license_id, nlines, indexed_p)
    indexed_flag = indexed_p ? 1 : 0
    # FIXME: It could happen for some cases.
    if mtime < 0
      vprintf("minus mtime found: %d", mtime)
      mtime = Time.now.to_i
    end
    [size, mtime, itime, format_id,
     license_id, nlines, indexed_flag].pack(PACK_FORMAT)
  end

  # Whether the content was indexed (value of the indexed_p member).
  def indexed?
    indexed_p
  end
end
75
+
76
# A located span: start offset, line number and length.
Occurrence = Struct.new(:byteno, :lineno, :length)
class Occurrence
  include BytenoMixin
end
80
+ end