RubyGems - rdig - Versions diffs - 0.1.0 - Mend

rdig 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

data/CHANGES +2 -0
data/LICENSE +20 -0
data/README +61 -0
data/TODO +0 -0
data/bin/rdig +32 -0
data/doc/examples/config.rb +53 -0
data/install.rb +89 -0
data/lib/htmlentities/CHANGES +21 -0
data/lib/htmlentities/COPYING +7 -0
data/lib/htmlentities/README +15 -0
data/lib/htmlentities/htmlentities.rb +281 -0
data/lib/rdig.rb +243 -0
data/lib/rdig/content_extractors.rb +145 -0
data/lib/rdig/crawler.rb +176 -0
data/lib/rdig/highlight.rb +24 -0
data/lib/rdig/http_client.rb +22 -0
data/lib/rdig/index.rb +39 -0
data/lib/rdig/search.rb +77 -0
data/lib/rdig/url_filters.rb +171 -0
data/rakefile +325 -0
data/test/fixtures/html/custom_tag_selectors.html +25 -0
data/test/fixtures/html/entities.html +15 -0
data/test/fixtures/html/simple.html +17 -0
data/test/test_helper.rb +18 -0
data/test/unit/etag_filter_test.rb +23 -0
data/test/unit/html_content_extractor_test.rb +64 -0
data/test/unit/url_filters_test.rb +96 -0
metadata +102 -0

data/lib/rdig/highlight.rb ADDED Viewed

@@ -0,0 +1,24 @@
+module RDig
+  module Search
+    # beginning of a port of the Query term highlighter from Lucene contrib
+    class Highlighter
+      def initialize
+        @analyzer = RDig.config.ferret.analyzer
+      end
+      def best_fragments(scorer, text, max_fragments = 1)
+        token_stream = @analyzer.token_stream('body', text)
+        frag_texts = []
+        get_best_text_fragments(token_stream, text, max_fragments).each { |frag|
+          frag_texts << frag.to_s if (frag && frag.score > 0)
+        }
+        return frag_texts
+      end
+      def get_best_text_fragments(token_stream, text, max_fragments)
+      end
+    end
+  end
+end

data/lib/rdig/http_client.rb ADDED Viewed

@@ -0,0 +1,22 @@
+module RDig
+  module HttpClient
+    # Fetches +uri+ with an HTTP GET request.
+    #
+    # uri::        the uri to retrieve; host, port, path and query are read
+    # user_agent:: value sent in the User-Agent header
+    #
+    # Returns the response object of Net::HTTP#get. If an error is raised
+    # along the way, it is printed to standard output and nil is returned.
+    def do_get(uri, user_agent='RDig crawler')
+      # Set up the appropriate http headers
+      headers = { "User-Agent" => user_agent }
+      # a uri without path (http://host) still needs a valid request target
+      request_uri = uri.path.to_s.empty? ? '/' : uri.path
+      request_uri += ('?' + uri.query) if uri.query
+      Net::HTTP.start(uri.host, (uri.port or 80)) { |http|
+        http.get(request_uri, headers)
+      }
+    rescue => error
+      puts error
+      nil
+    end
+  end
+end

data/lib/rdig/index.rb ADDED Viewed

@@ -0,0 +1,39 @@
+module RDig
+  module Index
+    # used by the crawler to build the ferret index
+    class Indexer
+      include MonitorMixin, Ferret::Index, Ferret::Document
+      def initialize(settings)
+        #@ferret_config = settings
+        @index_writer = IndexWriter.new(settings.path,
+                                        :create   => settings.create,
+                                        :analyzer => settings.analyzer)
+        super() # scary, MonitorMixin won't initialize if we don't call super() here (parens matter)
+      end
+      def add_to_index(document)
+        puts "add to index: #{document.uri.to_s}"
+        doc = Ferret::Document::Document.new
+        doc << Field.new("url", document.url,
+                        Field::Store::YES, Field::Index::UNTOKENIZED)
+        doc << Field.new("title", document.title,
+                        Field::Store::YES, Field::Index::TOKENIZED)
+        doc << Field.new("data",  document.body,
+                        Field::Store::YES, Field::Index::TOKENIZED)
+        synchronize do
+          @index_writer << doc
+        end
+      end
+      alias :<< :add_to_index
+      def close
+        @index_writer.optimize
+        @index_writer.close
+        @index_writer = nil
+      end
+    end
+  end
+end

data/lib/rdig/search.rb ADDED Viewed

@@ -0,0 +1,77 @@
+module RDig
+  module Search
+    # This class is used to search the index.
+    # Call RDig::searcher to retrieve an instance ready for use.
+    class Searcher
+      include Ferret::Search
+      # the query parser used to parse query strings
+      attr_reader :query_parser
+      # takes the ferret section of the rdig configuration as a parameter.
+      def initialize(settings)
+        @ferret_config = settings
+        @query_parser = Ferret::QueryParser.new('*', settings.marshal_dump)
+        ferret_searcher
+      end
+      # returns the Ferret::Search::IndexSearcher instance used internally.
+      def ferret_searcher
+        if @ferret_searcher and !@ferret_searcher.reader.latest?
+          # reopen searcher
+          @ferret_searcher.close
+          @ferret_searcher = nil
+        end
+        unless @ferret_searcher
+          @ferret_searcher = IndexSearcher.new(@ferret_config.path)
+          @query_parser.fields = @ferret_searcher.reader.get_field_names.to_a
+        end
+        @ferret_searcher
+      end
+      # run a search.
+      # +query+ usually will be a user-entered string. See the Ferret query
+      # language[http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html]
+      # for more information on queries.
+      # A Ferret::Search::Query instance may be given, too.
+      #
+      # Otions are:
+      # first_doc:: first document in result list to retrieve (0-based). The default is 0.
+      # num_docs:: number of documents to retrieve. The default is 10.
+      def search(query, options={})
+        result = {}
+        query = query_parser.parse(query) if query.is_a?(String)
+        puts "Query: #{query}"
+        hits = ferret_searcher.search(query, options)
+        result[:hitcount] = hits.total_hits
+        results = []
+        hits.each { |doc_id,score|
+          doc = ferret_searcher.reader.get_document doc_id
+          results << { :score => score,
+                      :title => doc['title'],
+                      :url => doc['url'],
+                      :extract => build_extract(doc['data']) }
+        }
+        result[:list] = results
+        result
+      end
+      def build_extract(data)
+        (data && data.length > 200) ? data[0..200] : data
+      end
+    end
+  #  class SearchResult < OpenStruct
+  #    def initialize(doc, score)
+  #      self.score = score
+  #      self.title = doc[:title]
+  #      self.url = doc[:url]
+  #      self.extract = doc[:content][0..200]
+  #    end
+  #  end
+  end
+end

data/lib/rdig/url_filters.rb ADDED Viewed

@@ -0,0 +1,171 @@
+module RDig
+  module UrlFilters
+    class FilterChain
+      def initialize(chain_config)
+        @filters = []
+        chain_config.each { |filter|
+          case filter
+          when Hash
+            filter.each_pair { |f, args|
+              add(f, args)
+            }
+          when Array
+            args = filter
+            filter = args.shift
+            add(filter, args)
+          else
+            add(filter)
+          end
+        }
+      end
+      # add a filter and it's args to the chain
+      # when args is a symbol, it is treated as a configuration key
+      def add(filter, args=nil)
+        args = RDig.config.crawler.send(args) if args.is_a? Symbol
+        case filter
+        when Symbol
+          if args.nil?
+            @filters << lambda { |document|
+              UrlFilters.send(filter, document)
+            }
+          else
+            @filters << lambda { |document|
+              UrlFilters.send(filter, document, args)
+            }
+          end
+        when Class
+          if args.nil?
+            if filter.respond_to?(:instance)
+              filter_instance = filter.instance
+            else
+              filter_instance = filter.new
+            end
+          else
+            filter_instance = filter.new(args)
+          end
+          @filters << lambda { |document|
+            filter_instance.apply(document)
+          }
+        end
+      end
+      def apply(document)
+        @filters.each { |filter|
+          return nil unless filter.call(document)
+        }
+        return document
+      end
+    end
+    # takes care of a list of all Urls visited during a crawl, to avoid
+    # indexing pages more than once
+    # implemented as a thread safe singleton as it has to be shared
+    # between all crawler threads
+    class VisitedUrlFilter
+      include MonitorMixin, Singleton
+      def initialize
+        @visited_urls = Set.new
+        super
+      end
+      # return document if this document's url has not been visited yet,
+      # nil otherwise
+      def apply(document)
+        synchronize do
+          @visited_urls.add?(document.uri.to_s) ? document : nil
+        end
+      end
+    end
+    # base class for url inclusion / exclusion filters
+    class UrlPatternFilter
+      # takes an Array of Regexps, or nil to disable the filter
+      def initialize(args=nil)
+        unless args.nil?
+          @patterns = []
+          if args.respond_to? :each
+            args.each { |pattern|
+              # cloning because unsure if regexps are thread safe...
+              @patterns << pattern.clone
+            }
+          else
+            @patterns << args.clone
+          end
+        end
+      end
+    end
+    class UrlExclusionFilter < UrlPatternFilter
+      # returns nil if any of the patterns matches it's URL,
+      # the document itself otherwise
+      def apply(document)
+        return document unless @patterns
+        @patterns.each { |p|
+          return nil if document.uri.to_s =~ p
+        }
+        return document
+      end
+    end
+    class UrlInclusionFilter < UrlPatternFilter
+      # returns nil if any of the patterns matches it's URL,
+      # the document itself otherwise
+      def apply(document)
+        return document unless @patterns
+        @patterns.each { |p|
+          return document if document.uri.to_s =~ p
+        }
+        return nil
+      end
+    end
+    # checks redirect count of the given document
+    # takes it out of the chain if number of redirections exceeds the
+    # max_redirects setting
+    def UrlFilters.maximum_redirect_filter(document, max_redirects)
+      return nil if document.redirections > max_redirects
+      return document
+    end
+    # expands both href="/path/xyz.html" and href="affe.html"
+    # to full urls
+    def UrlFilters.fix_relative_uri(document)
+      return nil unless document.uri.scheme.nil? || document.uri.scheme =~ /^http/i
+      ref = document.referring_uri
+      return document unless ref
+      uri = document.uri
+      uri.scheme = ref.scheme unless uri.scheme
+      uri.host = ref.host unless uri.host
+      uri.port = ref.port unless uri.port || ref.port==ref.default_port
+      uri.path = ref.path unless uri.path
+      if uri.path !~ /^\//
+        ref_path = ref.path || '/'
+        ref_path << '/' if ref_path.empty?
+        uri.path = ref_path[0..ref_path.rindex('/')] + uri.path
+      end
+      return document
+    end
+    def UrlFilters.hostname_filter(document, include_hosts)
+      return document if include_hosts.include?(document.uri.host)
+      return nil
+    end
+    def UrlFilters.normalize_uri(document)
+      document.uri.fragment = nil
+      # document.uri.query = nil
+      # append index document if configured and path ends with a slash
+      if RDig.config.index_document && document.uri.path =~ /\/$/
+        document.uri.path << RDig.config.index_document
+      end
+      return document
+    end
+  end
+end

data/rakefile ADDED Viewed

@@ -0,0 +1,325 @@
+# rakefile for RDig.
+# large parts borrowed from rake's Rakefile
+begin
+  require 'rubygems'
+  require 'rake/gempackagetask'
+rescue Exception
+  nil
+end
+require 'rake'
+require 'rake/testtask'
+require 'rake/rdoctask'
+require 'rake/packagetask'
+require 'rake/contrib/rubyforgepublisher'
+def announce(msg='')
+  STDERR.puts msg
+end
+PKG_NAME      = 'rdig'
+# Determine the current version of the software
+if `ruby -Ilib ./bin/rdig --version` =~ /rdig, version ([0-9.]+)$/
+  CURRENT_VERSION = $1
+else
+  CURRENT_VERSION = "0.0.0"
+end
+if ENV['REL']
+  PKG_VERSION = ENV['REL']
+else
+  PKG_VERSION = CURRENT_VERSION
+end
+SRC_RB = FileList['lib/**/*.rb']
+PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
+RELEASE_NAME  = "REL #{PKG_VERSION}"
+RUBY_FORGE_PROJECT = "rdig"
+RUBY_FORGE_USER    = "jkraemer"
+PKG_FILES = FileList[
+    "bin/**/*",
+    "lib/**/*",
+    "test/**/*",
+    "doc/**/*",
+    "[A-Z]*",
+    "install.rb",
+    "rakefile"
+].exclude(/\.svn|~$|\.swp$/)
+desc "Default Task"
+task :default => [ :test_all ]
+# Test Tasks -------------------------------------------------------------
+task :ta => :test_all
+task :tf => :test_functional
+task :tu => :test_units
+# Run all tests
+Rake::TestTask.new("test_all") { |t|
+  t.test_files = FileList[
+    'test/unit/*_test.rb',
+    'test/functional/*_test.rb'
+  ]
+  t.libs << "test"
+  #t.warning = true
+  t.verbose = true
+}
+# Run unit tests
+Rake::TestTask.new("test_units") { |t|
+  t.test_files = FileList[ 'test/unit/*_test.rb' ]
+  t.libs << "test"
+  #t.warning = true
+  t.verbose = true
+}
+# Run functional tests
+Rake::TestTask.new("test_functional") { |t|
+  t.test_files = FileList[ 'test/functional/*_test.rb' ]
+  t.libs << "test"
+  #t.warning = true
+  t.verbose = true
+}
+# Generate the RDoc documentation ----------------------------------------
+rd = Rake::RDocTask.new { |rdoc|
+  rdoc.rdoc_dir = 'doc/html'
+  rdoc.title    = "RDig - Ferret based full text search for web sites"
+  rdoc.options << '--line-numbers' << '--inline-source'
+  rdoc.options << '--main' << 'README'
+  rdoc.template = "#{ENV['template']}.rb" if ENV['template']
+  rdoc.rdoc_files.include('README', 'CHANGES', 'LICENSE', 'TODO')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+}
+# packaging --------------------------------------------------------------
+# ====================================================================
+# Create a task that will package the software into distributable
+# tar, zip and gem files.
+if ! defined?(Gem)
+  puts "Package Target requires RubyGEMs"
+else
+  spec = Gem::Specification.new do |s|
+    #### Basic information.
+    s.name = 'rdig'
+    s.version = PKG_VERSION
+    s.summary = "Ruby based web site indexing and searching library."
+    s.description = <<-EOF
+    RDig provides an HTTP crawler and content extraction utilities
+    to help building a site search for web sites or intranets. Internally,
+    Ferret is used for the full text indexing. After creating a config file
+    for your site, the index can be built with a single call to rdig.
+    EOF
+    #### Dependencies and requirements.
+    s.add_dependency('ferret', '>= 0.3.2')
+    s.add_dependency('rubyful_soup', '>= 1.0.4')
+    #s.requirements << ""
+    #### Which files are to be included in this gem?  Everything!  (Except CVS directories.)
+    s.files = PKG_FILES.to_a
+    #### Load-time details: library and application (you will need one or both).
+    s.require_path = 'lib'                  # Use these for libraries.
+    s.bindir = "bin"                        # Use these for applications.
+    s.executables = ["rdig"]
+    s.default_executable = "rdig"
+    #### Documentation and testing.
+    s.has_rdoc = true
+    s.extra_rdoc_files = rd.rdoc_files.reject { |fn| fn =~ /\.rb$/ }.to_a
+    s.rdoc_options <<
+      '--title' <<  'Rake -- Ruby Make' <<
+      '--main' << 'README' <<
+      '--line-numbers'
+    #### Author and project details.
+    s.author = "Jens Kraemer"
+    s.email = "jk@jkraemer.net"
+    s.homepage = "http://rdig.rubyforge.org/"
+    s.rubyforge_project = "rdig"
+#     if ENV['CERT_DIR']
+#       s.signing_key = File.join(ENV['CERT_DIR'], 'gem-private_key.pem')
+#       s.cert_chain  = [File.join(ENV['CERT_DIR'], 'gem-public_cert.pem')]
+#     end
+  end
+  package_task = Rake::GemPackageTask.new(spec) do |pkg|
+    pkg.need_zip = true
+    pkg.need_tar = true
+  end
+end
+# misc ----------------------------------------------------------------
+def count_lines(filename)
+  lines = 0
+  codelines = 0
+  open(filename) { |f|
+    f.each do |line|
+      lines += 1
+      next if line =~ /^\s*$/
+      next if line =~ /^\s*#/
+      codelines += 1
+    end
+  }
+  [lines, codelines]
+end
+def show_line(msg, lines, loc)
+  printf "%6s %6s   %s\n", lines.to_s, loc.to_s, msg
+end
+desc "Count lines in the main rake file"
+task :lines do
+  total_lines = 0
+  total_code = 0
+  show_line("File Name", "LINES", "LOC")
+  SRC_RB.each do |fn|
+    lines, codelines = count_lines(fn)
+    show_line(fn, lines, codelines)
+    total_lines += lines
+    total_code  += codelines
+  end
+  show_line("TOTAL", total_lines, total_code)
+end
+# Define an optional publish target in an external file.  If the
+# publish.rf file is not found, the publish targets won't be defined.
+load "publish.rf" if File.exist? "publish.rf"
+# Support Tasks ------------------------------------------------------
+desc "Look for TODO and FIXME tags in the code"
+task :todo do
+  FileList['**/*.rb'].exclude('pkg').egrep /#.*(FIXME|TODO|TBD)/
+end
+desc "Look for Debugging print lines"
+task :dbg do
+  FileList['**/*.rb'].egrep /\bDBG|\bbreakpoint\b/
+end
+desc "List all ruby files"
+task :rubyfiles do
+  puts Dir['**/*.rb'].reject { |fn| fn =~ /^pkg/ }
+  puts Dir['bin/*'].reject { |fn| fn =~ /CVS|(~$)|(\.rb$)/ }
+end
+task :rf => :rubyfiles
+# --------------------------------------------------------------------
+# Creating a release
+desc "Make a new release"
+task :release => [
+      :prerelease,
+      :clobber,
+      :test_all,
+      :update_version,
+      :package,
+      :tag] do
+  announce
+  announce "**************************************************************"
+  announce "* Release #{PKG_VERSION} Complete."
+  announce "* Packages ready to upload."
+  announce "**************************************************************"
+  announce
+end
+# Validate that everything is ready to go for a release.
+task :prerelease do
+  announce
+  announce "**************************************************************"
+  announce "* Making RubyGem Release #{PKG_VERSION}"
+  announce "* (current version #{CURRENT_VERSION})"
+  announce "**************************************************************"
+  announce
+  # Is a release number supplied?
+  unless ENV['REL']
+    fail "Usage: rake release REL=x.y.z [REUSE=tag_suffix]"
+  end
+  # Is the release different than the current release.
+  # (or is REUSE set?)
+  if PKG_VERSION == CURRENT_VERSION && ! ENV['REUSE']
+    fail "Current version is #{PKG_VERSION}, must specify REUSE=tag_suffix to reuse version"
+  end
+  # Are all source files checked in?
+  if ENV['RELTEST']
+    announce "Release Task Testing, skipping checked-in file test"
+  else
+    announce "Checking for unchecked-in files..."
+    data = `svn st`
+    unless data =~ /^$/
+      fail "SVN status is not clean ... do you have unchecked-in files?"
+    end
+    announce "No outstanding checkins found ... OK"
+  end
+end
+task :update_version => [:prerelease] do
+  if PKG_VERSION == CURRENT_VERSION
+    announce "No version change ... skipping version update"
+  else
+    announce "Updating RDig version to #{PKG_VERSION}"
+    open("lib/rdig.rb") do |rakein|
+      open("lib/rdig.rb.new", "w") do |rakeout|
+        rakein.each do |line|
+          if line =~ /^RDIGVERSION\s*=\s*/
+            rakeout.puts "RDIGVERSION = '#{PKG_VERSION}'"
+          else
+            rakeout.puts line
+          end
+        end
+      end
+    end
+    mv "lib/rdig.rb.new", "lib/rdig.rb"
+    if ENV['RELTEST']
+      announce "Release Task Testing, skipping commiting of new version"
+    else
+      sh %{svn commit -m "Updated to version #{PKG_VERSION}" lib/rdig.rb}
+    end
+  end
+end
+desc "Tag all files with the latest release number (REL=x.y.z)"
+task :tag => [:prerelease] do
+  reltag = "REL_#{PKG_VERSION.gsub(/\./, '_')}"
+  reltag << ENV['REUSE'].gsub(/\./, '_') if ENV['REUSE']
+  announce "Tagging with [#{reltag}]"
+  if ENV['RELTEST']
+    announce "Release Task Testing, skipping tagging"
+  else
+    sh %{cd ..; svn copy trunk tags/#{reltag}}
+  end
+end