RubyGems - gonzui - Versions diffs - 1.2 - Mend

gonzui 1.2

Files changed (116) hide show

data/AUTHORS.txt +9 -0
data/History.txt +5539 -0
data/Manifest.txt +115 -0
data/PostInstall.txt +17 -0
data/README.rdoc +149 -0
data/Rakefile +28 -0
data/bin/gonzui-db +167 -0
data/bin/gonzui-import +177 -0
data/bin/gonzui-remove +58 -0
data/bin/gonzui-search +68 -0
data/bin/gonzui-server +176 -0
data/bin/gonzui-update +53 -0
data/data/gonzui/catalog/catalog.ja +80 -0
data/data/gonzui/doc/favicon.ico +0 -0
data/data/gonzui/doc/folder.png +0 -0
data/data/gonzui/doc/gonzui.css +279 -0
data/data/gonzui/doc/gonzui.js +111 -0
data/data/gonzui/doc/text.png +0 -0
data/data/gonzuirc.sample +29 -0
data/ext/autopack/autopack.c +88 -0
data/ext/autopack/extconf.rb +3 -0
data/ext/delta/delta.c +147 -0
data/ext/delta/extconf.rb +5 -0
data/ext/texttokenizer/extconf.rb +5 -0
data/ext/texttokenizer/texttokenizer.c +93 -0
data/ext/xmlformatter/extconf.rb +5 -0
data/ext/xmlformatter/xmlformatter.c +207 -0
data/lib/gonzui.rb +59 -0
data/lib/gonzui/apt.rb +193 -0
data/lib/gonzui/bdbdbm.rb +118 -0
data/lib/gonzui/cmdapp.rb +14 -0
data/lib/gonzui/cmdapp/app.rb +175 -0
data/lib/gonzui/cmdapp/search.rb +134 -0
data/lib/gonzui/config.rb +117 -0
data/lib/gonzui/content.rb +19 -0
data/lib/gonzui/dbm.rb +673 -0
data/lib/gonzui/deindexer.rb +162 -0
data/lib/gonzui/delta.rb +49 -0
data/lib/gonzui/extractor.rb +347 -0
data/lib/gonzui/fetcher.rb +309 -0
data/lib/gonzui/gettext.rb +144 -0
data/lib/gonzui/importer.rb +84 -0
data/lib/gonzui/indexer.rb +316 -0
data/lib/gonzui/info.rb +80 -0
data/lib/gonzui/license.rb +100 -0
data/lib/gonzui/logger.rb +48 -0
data/lib/gonzui/monitor.rb +177 -0
data/lib/gonzui/progressbar.rb +235 -0
data/lib/gonzui/remover.rb +38 -0
data/lib/gonzui/searcher.rb +330 -0
data/lib/gonzui/searchquery.rb +235 -0
data/lib/gonzui/searchresult.rb +111 -0
data/lib/gonzui/updater.rb +254 -0
data/lib/gonzui/util.rb +415 -0
data/lib/gonzui/vcs.rb +128 -0
data/lib/gonzui/webapp.rb +25 -0
data/lib/gonzui/webapp/advsearch.rb +123 -0
data/lib/gonzui/webapp/filehandler.rb +24 -0
data/lib/gonzui/webapp/jsfeed.rb +61 -0
data/lib/gonzui/webapp/markup.rb +445 -0
data/lib/gonzui/webapp/search.rb +269 -0
data/lib/gonzui/webapp/servlet.rb +319 -0
data/lib/gonzui/webapp/snippet.rb +155 -0
data/lib/gonzui/webapp/source.rb +37 -0
data/lib/gonzui/webapp/stat.rb +137 -0
data/lib/gonzui/webapp/top.rb +63 -0
data/lib/gonzui/webapp/uri.rb +140 -0
data/lib/gonzui/webapp/webrick.rb +48 -0
data/script/console +10 -0
data/script/destroy +14 -0
data/script/generate +14 -0
data/script/makemanifest.rb +21 -0
data/tasks/extconf.rake +13 -0
data/tasks/extconf/autopack.rake +43 -0
data/tasks/extconf/delta.rake +43 -0
data/tasks/extconf/texttokenizer.rake +43 -0
data/tasks/extconf/xmlformatter.rake +43 -0
data/test/_external_tools.rb +13 -0
data/test/_test-util.rb +142 -0
data/test/foo/Makefile.foo +66 -0
data/test/foo/bar.c +5 -0
data/test/foo/bar.h +6 -0
data/test/foo/foo.c +25 -0
data/test/foo/foo.spec +33 -0
data/test/test_apt.rb +42 -0
data/test/test_autopack_extn.rb +7 -0
data/test/test_bdbdbm.rb +79 -0
data/test/test_cmdapp-app.rb +35 -0
data/test/test_cmdapp-search.rb +99 -0
data/test/test_config.rb +28 -0
data/test/test_content.rb +15 -0
data/test/test_dbm.rb +171 -0
data/test/test_deindexer.rb +50 -0
data/test/test_delta.rb +66 -0
data/test/test_extractor.rb +78 -0
data/test/test_fetcher.rb +75 -0
data/test/test_gettext.rb +50 -0
data/test/test_gonzui.rb +11 -0
data/test/test_helper.rb +10 -0
data/test/test_importer.rb +56 -0
data/test/test_indexer.rb +37 -0
data/test/test_info.rb +82 -0
data/test/test_license.rb +49 -0
data/test/test_logger.rb +60 -0
data/test/test_monitor.rb +23 -0
data/test/test_searcher.rb +37 -0
data/test/test_searchquery.rb +27 -0
data/test/test_searchresult.rb +43 -0
data/test/test_texttokenizer.rb +47 -0
data/test/test_updater.rb +95 -0
data/test/test_util.rb +149 -0
data/test/test_vcs.rb +61 -0
data/test/test_webapp-markup.rb +42 -0
data/test/test_webapp-util.rb +19 -0
data/test/test_webapp-xmlformatter.rb +19 -0
metadata +291 -0

@@ -0,0 +1,84 @@
+#
+# importer.rb - import contents to gonzui.db
+#
+# Copyright (C) 2004-2005 Satoru Takabayashi <satoru@namazu.org>
+#     All rights reserved.
+#     This is free software with ABSOLUTELY NO WARRANTY.
+#
+# You can redistribute it and/or modify it under the terms of
+# the GNU General Public License version 2.
+#
+require 'uri'
+module Gonzui
+  class ImporterError < GonzuiError; end
+  class Importer < AbstractUpdater
+    def initialize(config, options = {})
+      super(config, options)
+      # to be initialized
+      @last_package_name = nil
+    end
+    attr_reader :last_package_name
+    private
+    def import_package(fetcher, source_uri)
+      package_name = fetcher.package_name
+      raise ImporterError.new("#{package_name}: already exists") if
+        @dbm.has_package?(package_name)
+      relative_paths = fetcher.collect
+      pbar = make_progress_bar(package_name, relative_paths.length)
+      begin
+        relative_paths.each {|relative_path|
+          begin
+            normalized_path = File.join(package_name, relative_path)
+            content = nil
+            begin
+              content = fetcher.fetch(relative_path)
+            rescue => e
+              vprintf("fetch failed: %s: %s\n%s", relative_path, e.message)
+              next
+            end
+            index_content(source_uri, normalized_path, content)
+          ensure
+            pbar.inc
+          end
+        }
+      ensure
+        @dbm.flush_cache
+      end
+      pbar.finish
+      @npackages += 1
+      @last_package_name = package_name
+    end
+    def do_task_name
+      "imported"
+    end
+    public
+    def import(source_uri)
+      fetcher = Fetcher.new(@config, source_uri)
+      begin
+        import_package(fetcher, source_uri)
+      ensure
+        fetcher.finish
+      end
+    end
+    def summary
+      summary = super
+      if @config.verbose
+        stat = Indexer.statistics
+        summary += "\n" + stat unless stat.empty?
+      end
+      return summary
+    end
+    def finish
+      @dbm.close
+    end
+  end
+end

data/lib/gonzui/indexer.rb ADDED

@@ -0,0 +1,316 @@
+#
+# indexer.rb - indexer implementation
+#
+# Copyright (C) 2004-2005 Satoru Takabayashi <satoru@namazu.org>
+#     All rights reserved.
+#     This is free software with ABSOLUTELY NO WARRANTY.
+#
+# You can redistribute it and/or modify it under the terms of
+# the GNU General Public License version 2.
+#
+require 'ftools'
+require 'digest/md5'
+require 'langscan'
+module Gonzui
+  class IndexerError < GonzuiError; end
+  class Indexer
+    include Util
+    @@performance_monitor = PerformanceMonitor.new
+    def self.statistics
+      return "" if @@performance_monitor.empty?
+      pm = @@performance_monitor
+      summary = "Performance statistics:\n"
+      summary << pm.heading
+      summary << pm.format([Indexer, :index],
+                           [Indexer, :read_content],
+                           [Indexer, :add_license],
+                           [Indexer, :index_content])
+      labels = LangScan.modules.map {|m|
+        [m, :scan]
+      }.push([Indexer, :add_fragment],
+             [Indexer, :flush_cache])
+      summary << pm.format([Indexer, :index_content], *labels)
+      return summary
+    end
+    def initialize(config, dbm, source_uri, normalized_path, content,
+                   options = {})
+      @config = config
+      @dbm = dbm
+      @normalized_path = normalized_path
+      @source_uri = source_uri
+      @content = content
+      @content_hash = Digest::MD5.hexdigest(content.text)
+      @noindex_formats = (options[:noindex_formats] or @config.noindex_formats)
+      @package_name = get_package_name
+      @seqno      = 0
+      @word_cache = {}
+      @wordinfo_cache = {}
+      @digest_cache = []
+      # to be initialized
+      @format_id  = nil
+      @license_id = nil
+      @license_abbrev = nil
+      @encoding = nil
+      @nlines = nil
+      @package_id = nil
+      @path_id = nil
+      @bols = [] # positions of beginning of lines
+      @indexed_p = false
+      initialize_profilers_if_necessary
+    end
+    def initialize_profilers_if_necessary
+      # profiler
+      if @config.verbose
+        @@performance_monitor.profile(Indexer, :index)
+        @@performance_monitor.profile(Indexer, :read_content)
+        @@performance_monitor.profile(Indexer, :index_content)
+        @@performance_monitor.profile(Indexer, :add_fragment)
+        @@performance_monitor.profile(Indexer, :add_license)
+        @@performance_monitor.profile(Indexer, :flush_cache)
+      end
+    end
+    def read_content
+      content, @encoding = normalize_content(@content.text)
+      @content.text = content
+      @nlines = 0
+      pos = 0
+      @content.text.each_line {|line|
+        @bols.push(pos)
+        @nlines += 1
+        pos += line.length
+      }
+    end
+    # allow 0x09 (TAB), 0x0a (LF), 0x0c(^L), 0x0d (CR) 0x1b (ESC)
+    allowed = [0x09, 0x0a, 0x0c, 0x0d, 0x1b]
+    pattern = "["
+    pattern << (0...0x20).find_all {|x|
+      not allowed.include?(x)
+    }.map {|x| sprintf("\\x%02x", x) }.join
+    pattern << "]"
+    BinaryRegexp = Regexp.new(pattern)
+    def binary_content?(content)
+      BinaryRegexp.match(content)
+    end
+    def convert_to_utf8(content)
+      encoding = "ascii"
+      if binary_content?(content)
+        encoding = "binary"
+      else
+        if @config.utf8
+          content, encoding = UTF8.to_utf8(content)
+        end
+      end
+      return content, encoding
+    end
+    def normalize_content(content)
+      content, encoding = convert_to_utf8(content)
+      unless encoding == "binary"
+        content = content.untabify
+        content.gsub!(/\r\n?/, "\n")
+      end
+      return content, encoding
+    end
+    def get_package_name
+      parts = @normalized_path.split("/")
+      if parts.length < 2
+        raise IndexerError.new("normalized path should not be flat")
+      end
+      package_name = parts.first
+      if package_name.size == 0 || package_name == "." || package_name == ".."
+        package_name = File.basename(@source_uri.path)
+      end
+      return package_name
+    end
+    def add_text(fragment, type_id)
+      text = fragment.text
+      byteno = fragment.byteno
+      TextTokenizer.each_word(text) {|word, pos|
+        add_word(word, byteno + pos, type_id)
+      }
+    end
+    def add_fragment(fragment)
+      type_id = @dbm.get_type_id(fragment.type)
+      if LangScan::Type.splittable?(fragment.type)
+        add_text(fragment, type_id)
+      else
+        add_word(fragment.text, fragment.byteno, type_id)
+      end
+      @digest_cache.push(fragment.byteno, fragment.text.length, type_id)
+    end
+    def flush_cache
+      all_word_ids = @wordinfo_cache.keys.sort!
+      all_word_ids.each {|word_id|
+        path_word_id = AutoPack.pack_id2(@path_id, word_id)
+        @dbm.pathwordid_info[path_word_id] =
+          DeltaDumper.dump_tuples(WordInfo, @wordinfo_cache[word_id])
+      }
+      @dbm.put_pathid_wordids(@package_id, @path_id, all_word_ids)
+      @dbm.pathid_wordids[@path_id] = DeltaDumper.dump_ids(all_word_ids)
+      @dbm.pathid_digest[@path_id] =
+        DeltaDumper.dump_tuples(DigestInfo, @digest_cache)
+      @dbm.pathid_bols[@path_id] = DeltaDumper.dump_fixnums(@bols)
+      @wordinfo_cache.clear
+      @dbm.word_id_counter.flush
+    end
+    def add_property(abbrev, name, counter, make_key, pkgid_ids)
+      id = @dbm.send(counter).get_id2(abbrev, name)
+      @dbm.send(pkgid_ids)[@package_id] = id
+      @dbm.increase_counter(@dbm.send(make_key, abbrev))
+      return id
+    end
+    def add_format(format_abbrev, format_name)
+      @format_id = add_property(format_abbrev,
+                                format_name,
+                                :format_id_counter,
+                                :make_ncontents_by_format_key,
+                                :pkgid_fmtids)
+    end
+    def add_license
+      detector = LicenseDetector.new(@content.text)
+      license = detector.detect
+      @license_id = add_property(license.abbrev,
+                                 license.name,
+                                 :license_id_counter,
+                                 :make_ncontents_by_license_key,
+                                 :pkgid_lcsids)
+      @license_abbrev = license.abbrev
+    end
+    def add_path
+      assert_equal(false, @dbm.path_pathid.include?(@normalized_path))
+      @path_id = @dbm.path_id_counter.make_new_id
+      @dbm.path_pathid[@normalized_path] = @path_id
+      @dbm.pathid_path[@path_id] = @normalized_path
+      @dbm.pkgid_pathids[@package_id] = @path_id
+    end
+    def get_fragments(scanner)
+      @@performance_monitor.profile(scanner, :scan) if @config.verbose
+      fragments = []
+      scanner.scan(@content.text) {|fragment|
+        fragments.push(fragment) if LangScan::Type.include?(fragment.type)
+      }
+      fragments = fragments.sort_by {|fragment| fragment.byteno }
+      return fragments
+    end
+    def add_word(word, byteno, type_id)
+      word_id = @dbm.word_id_counter.get_id(word)
+      array = (@wordinfo_cache[word_id] ||= [])
+      array.push(@seqno, byteno, type_id)
+      @seqno += 1
+    end
+    def add_package_if_necessary
+      if @dbm.has_package?(@package_name)
+        @package_id = @dbm.get_package_id(@package_name)
+      else
+        @package_id = @dbm.package_id_counter.make_new_id
+        @dbm.pkg_pkgid[@package_name] = @package_id
+        @dbm.pkgid_pkg[@package_id] = @package_name
+        @dbm.pkgid_src[@package_id] = @source_uri.to_s
+        @dbm.put_package_options(@package_id)
+      end
+    end
+    def make_content_info
+      ContentInfo.dump(@content.length, @content.mtime.to_i,
+                       Time.now.to_i, @format_id, @license_id,
+                       @nlines, @indexed_p)
+    end
+    def index_content(scanner)
+      fragments = []
+      begin
+        fragments = get_fragments(scanner)
+      rescue
+        # fallback to the text scanner
+        unless scanner == LangScan::Text
+          vprintf("#{@normalized_path}: fallback to LangScan::Text")
+          scanner = LangScan::Text
+          retry
+        end
+      end
+      fragments.each {|fragment| add_fragment(fragment) }
+      flush_cache
+      @dbm.increase_counter(:ncontents_indexed)
+      @dbm.increase_counter(:nlines_indexed, @nlines)
+      @indexed_p = true
+    end
+    def add_content_common(format_abbrev, format_name)
+      add_format(format_abbrev, format_name)
+      add_license
+      @dbm.pathid_pkgid[@path_id] = @package_id
+      @dbm.pathid_content[@path_id] = @content.text
+      @dbm.pathid_info[@path_id] = make_content_info
+      @dbm.pathid_hash[@path_id] = @content_hash
+      vprintf("added (%s): %s (%s)", format_abbrev,
+              @normalized_path, @license_abbrev)
+    end
+    def add_binary_content
+      add_content_common("binary", "Binary")
+    end
+    def make_scanner
+      scanner = LangScan.choose(@normalized_path, @content.text)
+      scanner = LangScan::Text if scanner.nil?
+      return scanner
+    end
+    def indexable?(scanner)
+      not @noindex_formats.include?(scanner.abbrev)
+    end
+    def add_content_with_indexing
+      scanner = make_scanner
+      if indexable?(scanner)
+        index_content(scanner)
+      else
+        vprintf("skip indexing: %s", @normalized_path)
+      end
+      add_content_common(scanner.abbrev, scanner.name)
+    end
+    def add_content
+      if @encoding == "binary"
+        add_binary_content
+      else
+        add_content_with_indexing
+      end
+    end
+    public
+    def index
+      read_content
+      add_package_if_necessary
+      add_path
+      add_content
+    end
+  end
+end

data/lib/gonzui/info.rb ADDED

@@ -0,0 +1,80 @@
+#
+# info.rb - information classes
+#
+# Copyright (C) 2004-2005 Satoru Takabayashi <satoru@namazu.org>
+#     All rights reserved.
+#     This is free software with ABSOLUTELY NO WARRANTY.
+#
+# You can redistribute it and/or modify it under the terms of
+# the GNU General Public License version 2.
+#
+module Gonzui
+  module BytenoMixin
+    def end_byteno
+      byteno + length
+    end
+    def range
+      byteno ... (byteno + length)
+    end
+  end
+  WordInfo = Struct.new(:word_id, :path_id,
+                        :seqno, :byteno, :type_id, :type, :lineno)
+  class WordInfo
+    include BytenoMixin
+    # dump info
+    DeltaSize = 2
+    UnitSize = 3
+    def match?(target_type)
+      target_type == :all or target_type == self.type
+    end
+  end
+  DigestInfo = Struct.new(:byteno, :length, :type_id, :type)
+  class DigestInfo
+    include BytenoMixin
+    # dump info
+    DeltaSize = 1
+    UnitSize = 3
+  end
+  ContentInfo = Struct.new(:size, :mtime, :itime,
+                           :format_id, :license_id,
+                           :nlines, :indexed_p)
+  class ContentInfo
+    extend Util
+    PACK_FORMAT = "w*"
+    def self.load(dump)
+      info = self.new(*dump.unpack(PACK_FORMAT))
+      info.indexed_p = if info.indexed_p == 1 then true else false end
+      return info
+    end
+    def self.dump(size, mtime, itime, format_id,
+                  license_id,  nlines, indexed_p)
+      indexed_p = if indexed_p then 1 else 0 end
+      # FIXME: It could happen for some cases.
+      if mtime < 0
+        vprintf("minus mtime found: %d", mtime)
+        mtime = Time.now.to_i
+      end
+      [size, mtime, itime, format_id,
+        license_id, nlines, indexed_p].pack(PACK_FORMAT)
+    end
+    def indexed?
+      self.indexed_p
+    end
+  end
+  Occurrence = Struct.new(:byteno, :lineno, :length)
+  class Occurrence
+    include BytenoMixin
+  end
+end