gonzui 1.2-x86-mswin32-60

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120):
  1. data/AUTHORS.txt +9 -0
  2. data/History.txt +5539 -0
  3. data/Manifest.txt +115 -0
  4. data/PostInstall.txt +17 -0
  5. data/README.rdoc +149 -0
  6. data/Rakefile +28 -0
  7. data/bin/gonzui-db +167 -0
  8. data/bin/gonzui-import +177 -0
  9. data/bin/gonzui-remove +58 -0
  10. data/bin/gonzui-search +68 -0
  11. data/bin/gonzui-server +176 -0
  12. data/bin/gonzui-update +53 -0
  13. data/data/gonzui/catalog/catalog.ja +80 -0
  14. data/data/gonzui/doc/favicon.ico +0 -0
  15. data/data/gonzui/doc/folder.png +0 -0
  16. data/data/gonzui/doc/gonzui.css +279 -0
  17. data/data/gonzui/doc/gonzui.js +111 -0
  18. data/data/gonzui/doc/text.png +0 -0
  19. data/data/gonzuirc.sample +29 -0
  20. data/ext/autopack/autopack.c +88 -0
  21. data/ext/autopack/extconf.rb +3 -0
  22. data/ext/delta/delta.c +147 -0
  23. data/ext/delta/extconf.rb +5 -0
  24. data/ext/texttokenizer/extconf.rb +5 -0
  25. data/ext/texttokenizer/texttokenizer.c +93 -0
  26. data/ext/xmlformatter/extconf.rb +5 -0
  27. data/ext/xmlformatter/xmlformatter.c +207 -0
  28. data/lib/gonzui.rb +59 -0
  29. data/lib/gonzui/apt.rb +193 -0
  30. data/lib/gonzui/autopack.so +0 -0
  31. data/lib/gonzui/bdbdbm.rb +118 -0
  32. data/lib/gonzui/cmdapp.rb +14 -0
  33. data/lib/gonzui/cmdapp/app.rb +175 -0
  34. data/lib/gonzui/cmdapp/search.rb +134 -0
  35. data/lib/gonzui/config.rb +117 -0
  36. data/lib/gonzui/content.rb +19 -0
  37. data/lib/gonzui/dbm.rb +673 -0
  38. data/lib/gonzui/deindexer.rb +162 -0
  39. data/lib/gonzui/delta.rb +49 -0
  40. data/lib/gonzui/delta.so +0 -0
  41. data/lib/gonzui/extractor.rb +347 -0
  42. data/lib/gonzui/fetcher.rb +309 -0
  43. data/lib/gonzui/gettext.rb +144 -0
  44. data/lib/gonzui/importer.rb +84 -0
  45. data/lib/gonzui/indexer.rb +316 -0
  46. data/lib/gonzui/info.rb +80 -0
  47. data/lib/gonzui/license.rb +100 -0
  48. data/lib/gonzui/logger.rb +48 -0
  49. data/lib/gonzui/monitor.rb +177 -0
  50. data/lib/gonzui/progressbar.rb +235 -0
  51. data/lib/gonzui/remover.rb +38 -0
  52. data/lib/gonzui/searcher.rb +330 -0
  53. data/lib/gonzui/searchquery.rb +235 -0
  54. data/lib/gonzui/searchresult.rb +111 -0
  55. data/lib/gonzui/texttokenizer.so +0 -0
  56. data/lib/gonzui/updater.rb +254 -0
  57. data/lib/gonzui/util.rb +415 -0
  58. data/lib/gonzui/vcs.rb +128 -0
  59. data/lib/gonzui/webapp.rb +25 -0
  60. data/lib/gonzui/webapp/advsearch.rb +123 -0
  61. data/lib/gonzui/webapp/filehandler.rb +24 -0
  62. data/lib/gonzui/webapp/jsfeed.rb +61 -0
  63. data/lib/gonzui/webapp/markup.rb +445 -0
  64. data/lib/gonzui/webapp/search.rb +269 -0
  65. data/lib/gonzui/webapp/servlet.rb +319 -0
  66. data/lib/gonzui/webapp/snippet.rb +155 -0
  67. data/lib/gonzui/webapp/source.rb +37 -0
  68. data/lib/gonzui/webapp/stat.rb +137 -0
  69. data/lib/gonzui/webapp/top.rb +63 -0
  70. data/lib/gonzui/webapp/uri.rb +140 -0
  71. data/lib/gonzui/webapp/webrick.rb +48 -0
  72. data/lib/gonzui/webapp/xmlformatter.so +0 -0
  73. data/script/console +10 -0
  74. data/script/destroy +14 -0
  75. data/script/generate +14 -0
  76. data/script/makemanifest.rb +21 -0
  77. data/tasks/extconf.rake +13 -0
  78. data/tasks/extconf/autopack.rake +43 -0
  79. data/tasks/extconf/delta.rake +43 -0
  80. data/tasks/extconf/texttokenizer.rake +43 -0
  81. data/tasks/extconf/xmlformatter.rake +43 -0
  82. data/test/_external_tools.rb +13 -0
  83. data/test/_test-util.rb +142 -0
  84. data/test/foo/Makefile.foo +66 -0
  85. data/test/foo/bar.c +5 -0
  86. data/test/foo/bar.h +6 -0
  87. data/test/foo/foo.c +25 -0
  88. data/test/foo/foo.spec +33 -0
  89. data/test/test_apt.rb +42 -0
  90. data/test/test_autopack_extn.rb +7 -0
  91. data/test/test_bdbdbm.rb +79 -0
  92. data/test/test_cmdapp-app.rb +35 -0
  93. data/test/test_cmdapp-search.rb +99 -0
  94. data/test/test_config.rb +28 -0
  95. data/test/test_content.rb +15 -0
  96. data/test/test_dbm.rb +171 -0
  97. data/test/test_deindexer.rb +50 -0
  98. data/test/test_delta.rb +66 -0
  99. data/test/test_extractor.rb +78 -0
  100. data/test/test_fetcher.rb +75 -0
  101. data/test/test_gettext.rb +50 -0
  102. data/test/test_gonzui.rb +11 -0
  103. data/test/test_helper.rb +10 -0
  104. data/test/test_importer.rb +56 -0
  105. data/test/test_indexer.rb +37 -0
  106. data/test/test_info.rb +82 -0
  107. data/test/test_license.rb +49 -0
  108. data/test/test_logger.rb +60 -0
  109. data/test/test_monitor.rb +23 -0
  110. data/test/test_searcher.rb +37 -0
  111. data/test/test_searchquery.rb +27 -0
  112. data/test/test_searchresult.rb +43 -0
  113. data/test/test_texttokenizer.rb +47 -0
  114. data/test/test_updater.rb +95 -0
  115. data/test/test_util.rb +149 -0
  116. data/test/test_vcs.rb +61 -0
  117. data/test/test_webapp-markup.rb +42 -0
  118. data/test/test_webapp-util.rb +19 -0
  119. data/test/test_webapp-xmlformatter.rb +19 -0
  120. metadata +292 -0
@@ -0,0 +1,84 @@
1
+ #
2
+ # importer.rb - import contents to gonzui.db
3
+ #
4
+ # Copyright (C) 2004-2005 Satoru Takabayashi <satoru@namazu.org>
5
+ # All rights reserved.
6
+ # This is free software with ABSOLUTELY NO WARRANTY.
7
+ #
8
+ # You can redistribute it and/or modify it under the terms of
9
+ # the GNU General Public License version 2.
10
+ #
11
+
12
+ require 'uri'
13
+
14
+ module Gonzui
15
+ class ImporterError < GonzuiError; end  # raised by Importer#import_package when the package name is already present in the database
16
+
17
+ class Importer < AbstractUpdater
18
# Builds an importer on top of AbstractUpdater.
#
# config  - configuration object, forwarded unchanged to the superclass.
# options - optional settings hash, forwarded unchanged to the superclass.
def initialize(config, options = {})
  # A bare super forwards config and options exactly as received.
  super
  # Set by import_package after a package has been imported successfully.
  @last_package_name = nil
end
23
+ attr_reader :last_package_name
24
+
25
+ private
26
# Imports every file of one package into the database.
#
# fetcher    - package source; must respond to package_name, collect
#              (returning the relative paths) and fetch(relative_path).
# source_uri - origin of the package, passed through to index_content.
#
# Raises ImporterError when the database already has a package with the
# same name. A file whose fetch fails is reported through vprintf and
# skipped; the progress bar still advances for it. The dbm cache is flushed
# even when indexing aborts with an exception.
def import_package(fetcher, source_uri)
  package_name = fetcher.package_name
  if @dbm.has_package?(package_name)
    raise ImporterError.new("#{package_name}: already exists")
  end

  relative_paths = fetcher.collect
  pbar = make_progress_bar(package_name, relative_paths.length)
  begin
    relative_paths.each do |relative_path|
      begin
        normalized_path = File.join(package_name, relative_path)
        begin
          content = fetcher.fetch(relative_path)
        rescue => e
          # The format has three %s placeholders, so the backtrace has to
          # be supplied as well; with only two arguments the formatting
          # itself raised and aborted the import instead of skipping the
          # file.
          vprintf("fetch failed: %s: %s\n%s", relative_path, e.message,
                  (e.backtrace || []).join("\n"))
          next
        end
        index_content(source_uri, normalized_path, content)
      ensure
        # Runs for skipped files too (next still passes through ensure).
        pbar.inc
      end
    end
  ensure
    @dbm.flush_cache
  end
  pbar.finish
  @npackages += 1
  @last_package_name = package_name
end
56
+
57
# Returns the past-tense verb naming the task this updater performs.
def do_task_name
  "imported"
end
60
+
61
+ public
62
# Imports the package found at source_uri.
#
# A Fetcher is created for the URI and handed to import_package; the
# fetcher's finish is always called afterwards, even when the import raises.
# Returns whatever import_package returns.
def import(source_uri)
  package_source = Fetcher.new(@config, source_uri)
  begin
    import_package(package_source, source_uri)
  ensure
    package_source.finish
  end
end
70
+
71
# Returns the superclass summary; in verbose mode the indexer's performance
# statistics are appended on a new line when there are any.
def summary
  text = super
  return text unless @config.verbose

  stat = Indexer.statistics
  text += "\n" + stat unless stat.empty?
  return text
end
79
+
80
# Closes the underlying database handle.
def finish
  @dbm.close
end
83
+ end
84
+ end
@@ -0,0 +1,316 @@
1
+ #
2
+ # indexer.rb - indexer implementation
3
+ #
4
+ # Copyright (C) 2004-2005 Satoru Takabayashi <satoru@namazu.org>
5
+ # All rights reserved.
6
+ # This is free software with ABSOLUTELY NO WARRANTY.
7
+ #
8
+ # You can redistribute it and/or modify it under the terms of
9
+ # the GNU General Public License version 2.
10
+ #
11
+
12
+ require 'ftools'
13
+ require 'digest/md5'
14
+ require 'langscan'
15
+
16
+ module Gonzui
17
+ class IndexerError < GonzuiError; end  # raised by Indexer#get_package_name when the normalized path has no directory component
18
+
19
+ class Indexer
20
+ include Util
21
+
22
+ @@performance_monitor = PerformanceMonitor.new
23
+
24
# Builds the textual performance report collected by the shared
# performance monitor. Returns "" when nothing has been profiled.
def self.statistics
  monitor = @@performance_monitor
  return "" if monitor.empty?

  report = "Performance statistics:\n"
  report << monitor.heading
  # First table: the top-level indexing phases.
  # NOTE(review): the first label passed to format is presumably the
  # reference entry the others are compared against - confirm in
  # PerformanceMonitor.
  report << monitor.format([Indexer, :index],
                           [Indexer, :read_content],
                           [Indexer, :add_license],
                           [Indexer, :index_content])
  # Second table: every LangScan scanner's scan plus the indexing internals.
  rows = LangScan.modules.map {|scanner| [scanner, :scan] }
  rows.push([Indexer, :add_fragment], [Indexer, :flush_cache])
  report << monitor.format([Indexer, :index_content], *rows)
  return report
end
40
+
41
# Prepares the indexing of a single content.
#
# config          - configuration (verbose, utf8, noindex_formats are read).
# dbm             - database handle that receives all index data.
# source_uri      - URI of the package source; must respond to path and to_s.
# normalized_path - "<package>/<relative path>" of the content.
# content         - object exposing text, text=, length and mtime.
# options         - :noindex_formats overrides config.noindex_formats.
#
# Raises IndexerError (via get_package_name) for a flat normalized path.
def initialize(config, dbm, source_uri, normalized_path, content,
               options = {})
  @config = config
  @dbm = dbm
  @source_uri = source_uri
  @normalized_path = normalized_path
  @content = content
  # Digest of the text as received, i.e. before read_content normalizes it.
  @content_hash = Digest::MD5.hexdigest(content.text)
  @noindex_formats = options[:noindex_formats] || @config.noindex_formats

  @package_name = get_package_name
  @seqno = 0

  @word_cache = {}
  @wordinfo_cache = {}  # word_id => flat list of seqno, byteno, type_id
  @digest_cache = []    # flat list of byteno, length, type_id

  # Filled in while indexing.
  @format_id = @license_id = @license_abbrev = nil
  @encoding = @nlines = nil
  @package_id = @path_id = nil
  @bols = []            # positions of beginning of lines
  @indexed_p = false

  initialize_profilers_if_necessary
end
71
+
72
# In verbose mode, registers the indexer's main phases with the shared
# performance monitor; otherwise does nothing and returns nil.
def initialize_profilers_if_necessary
  return unless @config.verbose

  profiled = [:index, :read_content, :index_content,
              :add_fragment, :add_license, :flush_cache]
  # map + last keeps the value of the final profile call as the result.
  profiled.map {|method_name|
    @@performance_monitor.profile(Indexer, method_name)
  }.last
end
83
+
84
# Normalizes the content text in place and records its encoding, its
# number of lines and the start offset of every line (in @bols).
def read_content
  normalized, @encoding = normalize_content(@content.text)
  @content.text = normalized
  @nlines = 0
  offset = 0
  @content.text.each_line do |line|
    @bols << offset
    @nlines += 1
    offset += line.length
  end
end
95
+
96
# BinaryRegexp matches any control character below 0x20 except the allowed
# ones: 0x09 (TAB), 0x0a (LF), 0x0c (^L), 0x0d (CR) and 0x1b (ESC).
allowed = [0x09, 0x0a, 0x0c, 0x0d, 0x1b]
escapes = ((0...0x20).to_a - allowed).map {|code| sprintf("\\x%02x", code) }
pattern = "[" + escapes.join + "]"
BinaryRegexp = Regexp.new(pattern)
104
+
105
# Tells whether content contains a control character that BinaryRegexp
# rejects. Returns the MatchData of the first such character, or nil.
def binary_content?(content)
  BinaryRegexp.match(content)
end
108
+
109
# Returns [content, encoding].
#
# Binary content is returned untouched with the encoding "binary". Other
# content goes through UTF8.to_utf8 when the utf8 option is enabled;
# otherwise it is returned as is with the encoding "ascii".
def convert_to_utf8(content)
  return content, "binary" if binary_content?(content)
  return content, "ascii" unless @config.utf8

  converted, detected = UTF8.to_utf8(content)
  return converted, detected
end
120
+
121
# Returns [text, encoding] for content.
#
# Non-binary text is untabified and its CR LF / CR line endings become LF;
# binary content is passed through unchanged.
def normalize_content(content)
  text, encoding = convert_to_utf8(content)
  if encoding != "binary"
    text = text.untabify
    text.gsub!(/\r\n?/, "\n")
  end
  return text, encoding
end
129
+
130
# Derives the package name from the normalized path.
#
# The first path component is used; when it is empty, "." or "..", the
# base name of the source URI's path is used instead.
# Raises IndexerError when the path has fewer than two components.
def get_package_name
  components = @normalized_path.split("/")
  if components.length < 2
    raise IndexerError.new("normalized path should not be flat")
  end

  leading = components.first
  return leading unless ["", ".", ".."].include?(leading)
  return File.basename(@source_uri.path)
end
141
+
142
# Splits the fragment's text into words and registers each of them at its
# position within the content (fragment offset + offset inside the text).
def add_text(fragment, type_id)
  text = fragment.text
  base = fragment.byteno
  TextTokenizer.each_word(text) do |word, offset|
    add_word(word, base + offset, type_id)
  end
end
149
+
150
# Registers one scanned fragment.
#
# Fragments of a splittable type are tokenized into words; any other
# fragment is stored as a single word. In both cases the fragment's
# position, length and type id are appended to the digest cache.
def add_fragment(fragment)
  type_id = @dbm.get_type_id(fragment.type)
  splittable = LangScan::Type.splittable?(fragment.type)
  if splittable
    add_text(fragment, type_id)
  else
    add_word(fragment.text, fragment.byteno, type_id)
  end

  @digest_cache.push(fragment.byteno, fragment.text.length, type_id)
end
160
+
161
# Writes everything collected for the current path to the database: the
# per-word occurrence tuples, the list of word ids, the digest tuples and
# the line start offsets. Afterwards the word info cache is emptied and the
# word id counter is flushed.
def flush_cache
  word_ids = @wordinfo_cache.keys.sort
  word_ids.each do |word_id|
    key = AutoPack.pack_id2(@path_id, word_id)
    @dbm.pathwordid_info[key] =
      DeltaDumper.dump_tuples(WordInfo, @wordinfo_cache[word_id])
  end
  @dbm.put_pathid_wordids(@package_id, @path_id, word_ids)
  @dbm.pathid_wordids[@path_id] = DeltaDumper.dump_ids(word_ids)
  @dbm.pathid_digest[@path_id] =
    DeltaDumper.dump_tuples(DigestInfo, @digest_cache)
  @dbm.pathid_bols[@path_id] = DeltaDumper.dump_fixnums(@bols)
  @wordinfo_cache.clear
  @dbm.word_id_counter.flush
end
176
+
177
# Registers a named property (such as a format or a license) for the
# current package and bumps the matching counter.
#
# abbrev, name - short and long name of the property.
# counter      - name of the dbm method returning the id counter.
# make_key     - name of the dbm method building the counter key from abbrev.
# pkgid_ids    - name of the dbm method returning the package-id => id table.
#
# Returns the property's id.
def add_property(abbrev, name, counter, make_key, pkgid_ids)
  property_id = @dbm.send(counter).get_id2(abbrev, name)
  @dbm.send(pkgid_ids)[@package_id] = property_id
  counter_key = @dbm.send(make_key, abbrev)
  @dbm.increase_counter(counter_key)
  return property_id
end
183
+
184
# Registers the content's format for the current package and remembers the
# resulting id in @format_id, which is also the return value.
def add_format(format_abbrev, format_name)
  @format_id = add_property(format_abbrev, format_name,
                            :format_id_counter,
                            :make_ncontents_by_format_key,
                            :pkgid_fmtids)
end
191
+
192
# Detects the license of the content text, registers it for the current
# package and stores its id and abbreviation in @license_id and
# @license_abbrev. Returns the abbreviation.
def add_license
  license = LicenseDetector.new(@content.text).detect
  @license_id = add_property(license.abbrev, license.name,
                             :license_id_counter,
                             :make_ncontents_by_license_key,
                             :pkgid_lcsids)
  @license_abbrev = license.abbrev
end
202
+
203
# Allocates a new path id for the normalized path and records the
# path <-> id mappings as well as the package's path id.
# The path must not be registered yet; this is checked with assert_equal.
def add_path
  already_registered = @dbm.path_pathid.include?(@normalized_path)
  assert_equal(false, already_registered)
  @path_id = @dbm.path_id_counter.make_new_id
  @dbm.path_pathid[@normalized_path] = @path_id
  @dbm.pathid_path[@path_id] = @normalized_path
  @dbm.pkgid_pathids[@package_id] = @path_id
end
210
+
211
# Runs scanner over the content text and returns the fragments whose type
# is known to LangScan::Type, ordered by their position in the content.
# In verbose mode the scanner's scan method is profiled as well.
def get_fragments(scanner)
  @@performance_monitor.profile(scanner, :scan) if @config.verbose
  accepted = []
  scanner.scan(@content.text) do |fragment|
    accepted << fragment if LangScan::Type.include?(fragment.type)
  end
  return accepted.sort_by {|fragment| fragment.byteno }
end
220
+
221
# Records one occurrence of word: its sequence number, position and type id
# are appended to the word's entry in the word info cache, then the
# sequence number advances.
def add_word(word, byteno, type_id)
  word_id = @dbm.word_id_counter.get_id(word)
  (@wordinfo_cache[word_id] ||= []).push(@seqno, byteno, type_id)
  @seqno += 1
end
227
+
228
# Sets @package_id for the current package, creating the package in the
# database (name <-> id mappings, source URI, package options) when it is
# not known yet.
def add_package_if_necessary
  known = @dbm.has_package?(@package_name)
  if known
    @package_id = @dbm.get_package_id(@package_name)
  else
    @package_id = @dbm.package_id_counter.make_new_id
    @dbm.pkg_pkgid[@package_name] = @package_id
    @dbm.pkgid_pkg[@package_id] = @package_name
    @dbm.pkgid_src[@package_id] = @source_uri.to_s
    @dbm.put_package_options(@package_id)
  end
end
239
+
240
# Serializes the content's metadata (size, modification time, the current
# time as indexing time, format and license ids, line count, indexed flag)
# with ContentInfo.dump.
def make_content_info
  size = @content.length
  modified_at = @content.mtime.to_i
  indexed_at = Time.now.to_i
  ContentInfo.dump(size, modified_at, indexed_at,
                   @format_id, @license_id, @nlines, @indexed_p)
end
245
+
246
# Scans the content with scanner and indexes the resulting fragments.
#
# When the scanner raises, indexing is retried once with LangScan::Text.
# When that scanner fails too, the content is indexed without fragments
# and the failure is reported through vprintf.
# Afterwards the caches are flushed, the content and line counters are
# increased and the content is marked as indexed.
def index_content(scanner)
  fragments = []
  begin
    fragments = get_fragments(scanner)
  rescue => e
    if scanner == LangScan::Text
      # Best effort as before (no fragments), but no longer silent.
      vprintf("%s: scan failed: %s", @normalized_path, e.message)
    else
      # The path is passed as an argument rather than interpolated into the
      # format string: a "%" in a file name would otherwise be read as a
      # format directive and make the logging call itself raise.
      vprintf("%s: fallback to LangScan::Text", @normalized_path)
      scanner = LangScan::Text
      retry
    end
  end
  fragments.each {|fragment| add_fragment(fragment) }
  flush_cache
  @dbm.increase_counter(:ncontents_indexed)
  @dbm.increase_counter(:nlines_indexed, @nlines)
  @indexed_p = true
end
264
+
265
# Stores everything that is recorded for every content, indexed or not:
# format, license, owning package, text, serialized metadata and hash.
# Finishes with a vprintf message naming the format, path and license.
def add_content_common(format_abbrev, format_name)
  add_format(format_abbrev, format_name)
  add_license
  @dbm.pathid_pkgid[@path_id] = @package_id
  @dbm.pathid_content[@path_id] = @content.text
  @dbm.pathid_info[@path_id] = make_content_info
  @dbm.pathid_hash[@path_id] = @content_hash
  vprintf("added (%s): %s (%s)",
          format_abbrev, @normalized_path, @license_abbrev)
end
275
+
276
# Stores binary content: it is recorded under the "binary" format and is
# not run through a scanner.
def add_binary_content
  add_content_common("binary", "Binary")
end
279
+
280
# Picks the LangScan scanner for the content, falling back to
# LangScan::Text when LangScan.choose returns nil.
def make_scanner
  chosen = LangScan.choose(@normalized_path, @content.text)
  return chosen.nil? ? LangScan::Text : chosen
end
285
+
286
# True unless the scanner's format abbreviation is listed in
# @noindex_formats.
def indexable?(scanner)
  !@noindex_formats.include?(scanner.abbrev)
end
289
+
290
# Stores non-binary content. The content is indexed first unless its
# format is excluded from indexing; the common data is recorded either way
# under the chosen scanner's format.
def add_content_with_indexing
  chosen = make_scanner
  if indexable?(chosen)
    index_content(chosen)
  else
    vprintf("skip indexing: %s", @normalized_path)
  end
  add_content_common(chosen.abbrev, chosen.name)
end
299
+
300
# Stores the content, choosing the binary or the indexing path according to
# the encoding determined by read_content.
def add_content
  return add_binary_content if @encoding == "binary"
  add_content_with_indexing
end
307
+
308
+ public
309
# Indexes the content given to the constructor: reads and normalizes it,
# makes sure its package exists, registers its path and stores the content.
def index
  read_content
  add_package_if_necessary
  add_path
  add_content
end
315
+ end
316
+ end
@@ -0,0 +1,80 @@
1
+ #
2
+ # info.rb - information classes
3
+ #
4
+ # Copyright (C) 2004-2005 Satoru Takabayashi <satoru@namazu.org>
5
+ # All rights reserved.
6
+ # This is free software with ABSOLUTELY NO WARRANTY.
7
+ #
8
+ # You can redistribute it and/or modify it under the terms of
9
+ # the GNU General Public License version 2.
10
+ #
11
+
12
+ module Gonzui
13
# Helpers for objects that respond to byteno (start offset) and length.
module BytenoMixin
  # Offset just past the end of the span.
  def end_byteno
    byteno + length
  end

  # The span as an end-exclusive Range of offsets.
  def range
    Range.new(byteno, byteno + length, true)
  end
end
22
+
23
# One occurrence of a word inside a content.
WordInfo = Struct.new(:word_id, :path_id, :seqno, :byteno,
                      :type_id, :type, :lineno)
class WordInfo
  include BytenoMixin

  # dump info
  # NOTE(review): UnitSize matches the three values Indexer#add_word stores
  # per occurrence (seqno, byteno, type_id); DeltaSize is presumably how
  # many leading values are delta-encoded - confirm against DeltaDumper.
  DeltaSize = 2
  UnitSize = 3

  # True when target_type is :all or equals this occurrence's type.
  def match?(target_type)
    target_type == :all || target_type == self.type
  end
end
36
+
37
# Position, length and type of one fragment of a content.
DigestInfo = Struct.new(:byteno, :length, :type_id, :type)
class DigestInfo
  include BytenoMixin

  # dump info
  # NOTE(review): UnitSize matches the three values Indexer#add_fragment
  # stores per fragment (byteno, length, type_id); DeltaSize is presumably
  # how many leading values are delta-encoded - confirm against DeltaDumper.
  DeltaSize = 1
  UnitSize = 3
end
45
+
46
# Metadata stored for every content, serialized with Array#pack.
ContentInfo = Struct.new(:size, :mtime, :itime, :format_id, :license_id,
                         :nlines, :indexed_p)
class ContentInfo
  extend Util
  PACK_FORMAT = "w*"

  # Rebuilds a ContentInfo from a string produced by dump. The stored
  # indexed flag (1 or 0) is turned back into true or false.
  def self.load(dump)
    fields = dump.unpack(PACK_FORMAT)
    info = new(*fields)
    info.indexed_p = (info.indexed_p == 1)
    return info
  end

  # Packs the given values into a string; indexed_p is stored as 1 or 0.
  # A negative mtime is reported through vprintf and replaced by the
  # current time.
  def self.dump(size, mtime, itime, format_id,
                license_id, nlines, indexed_p)
    indexed_flag = indexed_p ? 1 : 0
    # FIXME: It could happen for some cases.
    if mtime < 0
      vprintf("minus mtime found: %d", mtime)
      mtime = Time.now.to_i
    end
    values = [size, mtime, itime, format_id,
              license_id, nlines, indexed_flag]
    values.pack(PACK_FORMAT)
  end

  # Whether the content was indexed.
  def indexed?
    indexed_p
  end
end
75
+
76
# A span inside a content: start offset, line number and length.
Occurrence = Struct.new(:byteno, :lineno, :length)
class Occurrence
  include BytenoMixin
end
80
+ end