RubyGems - rdig - Versions diffs - 0.1.0 - Mend

rdig 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

data/CHANGES +2 -0
data/LICENSE +20 -0
data/README +61 -0
data/TODO +0 -0
data/bin/rdig +32 -0
data/doc/examples/config.rb +53 -0
data/install.rb +89 -0
data/lib/htmlentities/CHANGES +21 -0
data/lib/htmlentities/COPYING +7 -0
data/lib/htmlentities/README +15 -0
data/lib/htmlentities/htmlentities.rb +281 -0
data/lib/rdig.rb +243 -0
data/lib/rdig/content_extractors.rb +145 -0
data/lib/rdig/crawler.rb +176 -0
data/lib/rdig/highlight.rb +24 -0
data/lib/rdig/http_client.rb +22 -0
data/lib/rdig/index.rb +39 -0
data/lib/rdig/search.rb +77 -0
data/lib/rdig/url_filters.rb +171 -0
data/rakefile +325 -0
data/test/fixtures/html/custom_tag_selectors.html +25 -0
data/test/fixtures/html/entities.html +15 -0
data/test/fixtures/html/simple.html +17 -0
data/test/test_helper.rb +18 -0
data/test/unit/etag_filter_test.rb +23 -0
data/test/unit/html_content_extractor_test.rb +64 -0
data/test/unit/url_filters_test.rb +96 -0
metadata +102 -0

data/CHANGES ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ 0.1.0
2	+ initial release

data/LICENSE ADDED Viewed

@@ -0,0 +1,20 @@
+Copyright (c) 2006 Jens Kraemer
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README ADDED Viewed

@@ -0,0 +1,61 @@
+= RDig
+RDig provides an HTTP crawler and content extraction utilities
+to help build a site search for web sites or intranets. Internally,
+Ferret is used for the full text indexing. After creating a config file
+for your site, the index can be built with a single call to rdig.
+RDig depends on Ferret (>= 0.3.2) and the RubyfulSoup library (>= 1.0.4).
+== basic usage
+=== Index creation
+- create a config file based on the template in doc/examples
+- to create an index:
+    rdig -c CONFIGFILE
+- to run a query against the index (just to try it out)
+    rdig -c CONFIGFILE -q 'your query'
+  this will dump the first 10 search results to STDOUT
+=== Handle search in your application:
+  require 'rdig'
+  require 'rdig_config'   # load your config file here
+  search_results = RDig.searcher.search(query, options={})
+see RDig::Search::Searcher for more information.
+== usage in rails
+- add to config/environment.rb :
+    require 'rdig'
+    require 'rdig_config'
+- place rdig_config.rb into config/ directory.
+- build index:
+    rdig -c config/rdig_config.rb
+- in your controller that handles the search form:
+    search_results = RDig.searcher.search(params[:query])
+    @results = search_results[:list]
+    @hitcount = search_results[:hitcount]
+=== search result paging
+Use the :first_doc and :num_docs options to implement
+paging through search results.
+(:num_docs is 10 by default, so without using these options only the first 10
+results will be retrieved)
+== sample configuration
+from doc/examples/config.rb. The tag_selector properties are called
+with a BeautifulSoup instance as parameter. See the RubyfulSoup Site[http://www.crummy.com/software/RubyfulSoup/documentation.html] for more info about this cool lib.
+You can also have a look at the +html_content_extractor+ unit test.
+See {the RubyfulSoup documentation}[http://www.crummy.com/software/RubyfulSoup/documentation.html] for API documentation of the
+Rubyful Soup lib used.
+:include:doc/examples/config.rb

data/TODO ADDED Viewed

File without changes

data/bin/rdig ADDED Viewed

@@ -0,0 +1,32 @@
+#!/usr/bin/env ruby
+# Command line entry point for RDig: loads the rdig library and hands
+# control to RDig.application.run.
+#
+# NOTE(review): the usage text below names a 'sitesearch' plugin script
+# rather than this file -- presumably left over from an earlier incarnation
+# of the project; confirm and update.
+# run from RAILS_ROOT with
+# ruby -Ilib vendor/plugins/sitesearch/create_index.rb config
+# where config is the name of your config file
+# Try a plain require first; if rdig is not on the load path, load
+# RubyGems and try again.
+begin
+  require 'rdig'
+rescue LoadError
+  require 'rubygems'
+  require 'rdig'
+end
+RDig.application.run
+# NOTE(review): everything below is commented-out code that refers to a
+# SiteSearch module -- presumably dead legacy code; confirm before removing.
+#$LOAD_PATH << File.expand_path(File.dirname(__FILE__) + "/lib")
+#$LOAD_PATH << File.expand_path(File.dirname(__FILE__))
+#require 'init'
+#if ARGV[0]
+#  require ARGV[0]
+#else
+#  require 'config'
+#end
+#include SiteSearch
+#puts "creating new index in #{SiteSearch.settings[:index_dir]}"
+#crawler = Crawler.new
+#crawler.run

data/doc/examples/config.rb ADDED Viewed

@@ -0,0 +1,53 @@
+# Example RDig configuration. Adapt the three required settings below,
+# then pass this file to rdig with -c or require it from your application.
+# Commented-out assignments show the default value of each optional setting.
+RDig.configuration do |cfg|
+  ##################################################################
+  # options you should really set
+  # provide one or more URLs for the crawler to start from
+  cfg.crawler.start_urls = [ 'http://www.example.com/' ]
+  # limit the crawl to these hosts. The crawler will never
+  # follow any links pointing to hosts other than those given here.
+  cfg.crawler.include_hosts = [ 'www.example.com' ]
+  # this is the path where the index will be stored
+  # caution, existing contents of this directory will be deleted!
+  cfg.ferret.path        = '/path/to/index'
+  ##################################################################
+  # options you might want to set, the given values are the defaults
+  # content extraction options
+  # provide a method that selects the tag containing the title of a document
+  # cfg.content_extraction.html.title_tag_selector = lambda { |tagsoup| tagsoup.html.head.title }
+  # provide a method that selects the tag containing the page content you
+  # want to index. Useful to avoid indexing common elements like navigation
+  # and page footers for every page.
+  # cfg.content_extraction.html.content_tag_selector = lambda { |tagsoup| tagsoup.html.body }
+  # crawler options
+  # nil (index all documents) or an array of Regexps
+  # matching URLs you want to index.
+  # cfg.crawler.include_documents = nil
+  # nil (no documents excluded) or an array of Regexps
+  # matching URLs not to index.
+  # this filter is used after the one above, so you only need
+  # to exclude documents here that aren't wanted but would be
+  # included by the inclusion patterns.
+  # cfg.crawler.exclude_documents = nil
+  # number of http fetching threads to use
+  # cfg.crawler.num_threads = 2
+  # maximum number of http redirections to follow
+  # cfg.crawler.max_redirects = 5
+  # number of seconds to wait with an empty url queue before
+  # finishing the crawl. Set to a higher number for slow sites
+  # cfg.crawler.wait_before_leave = 10
+end

data/install.rb ADDED Viewed

@@ -0,0 +1,89 @@
+require 'rbconfig'
+require 'find'
+require 'ftools'
+# Mix in Config so the interpreter's build settings can be read as CONFIG[...]
+include Config
+# Name of the ruby executable; installBIN joins it with the bin directory
+# to build the #! line of the installed script.
+$ruby = CONFIG['ruby_install_name']
+##
+# Install a binary file. We patch it on the way through to
+# insert a #! line. If this is a Unix install, we name
+# the command (for example) 'rdig' and let the shebang line
+# handle running it. Under windows, we add a '.rb' extension
+# and let file associations do their stuff
+#
+# based on install.rb from the Rake distribution
+#
+# from::   path of the script to install (read in full)
+# opfile:: file name to install under inside $bindir
+#
+# Reads the globals $bindir, $realbindir and $ruby.
+# Raises RuntimeError if no writable temporary directory can be found.
+def installBIN(from, opfile)
+  # Use the first candidate that exists, is a directory and is writable;
+  # candidates that cannot be stat'ed are skipped.
+  tmp_dir = [".", "/tmp", "c:/temp", $bindir].find do |dir|
+    stat = File.stat(dir) rescue nil
+    stat && stat.directory? && stat.writable?
+  end
+  fail "Cannot find a temporary directory" unless tmp_dir
+  tmp_file = File.join(tmp_dir, "_tmp")
+  begin
+    File.open(from) do |ip|
+      File.open(tmp_file, "w") do |op|
+        # the shebang points at the real interpreter location, not at a
+        # DESTDIR-prefixed staging path
+        ruby = File.join($realbindir, $ruby)
+        op.puts "#!#{ruby} -w"
+        op.write ip.read
+      end
+    end
+    opfile += ".rb" if CONFIG["target_os"] =~ /mswin/i
+    File::install(tmp_file, File.join($bindir, opfile), 0755, true)
+  ensure
+    # do not leave the temporary copy behind when reading or installing fails
+    File::unlink(tmp_file) if File.exist?(tmp_file)
+  end
+end
+$sitedir = CONFIG["sitelibdir"]
+unless $sitedir
+  version = CONFIG["MAJOR"]+"."+CONFIG["MINOR"]
+  $libdir = File.join(CONFIG["libdir"], "ruby", version)
+  $sitedir = $:.find {|x| x =~ /site_ruby/}
+  if !$sitedir
+    $sitedir = File.join($libdir, "site_ruby")
+  elsif $sitedir !~ Regexp.quote(version)
+    $sitedir = File.join($sitedir, version)
+  end
+end
+$bindir =  CONFIG["bindir"]
+$realbindir = $bindir
+bindir = CONFIG["bindir"]
+if (destdir = ENV['DESTDIR'])
+  $bindir  = destdir + $bindir
+  $sitedir = destdir + $sitedir
+  File::makedirs($bindir)
+  File::makedirs($sitedir)
+end
+rdig_dest = File.join($sitedir, "rdig")
+File::makedirs(rdig_dest, true)
+File::chmod(0755, rdig_dest)
+# The library files
+files = Dir.chdir('lib') { Dir['**/*.rb'] }
+for fn in files
+  fn_dir = File.dirname(fn)
+  target_dir = File.join($sitedir, fn_dir)
+  if ! File.exist?(target_dir)
+    File.makedirs(target_dir)
+  end
+  File::install(File.join('lib', fn), File.join($sitedir, fn), 0644, true)
+end
+# and the executable
+installBIN("bin/rdig", "rdig")

data/lib/htmlentities/CHANGES ADDED Viewed

@@ -0,0 +1,21 @@
+== 2.2 (2005-11-07)
+* Important bug fixes -- thanks to Moonwolf
+* Decoding hexadecimal entities now accepts 'f' as a hex digit. (D'oh!)
+* Decimal decoding edge cases addressed.
+* Test cases added.
+== 2.1 (2005-10-31)
+* Removed some unnecessary code in basic entity encoding.
+* Improved handling of encoding: commands are now automatically sorted, so the
+  user doesn't have to worry about their order.
+* Now using setup.rb.
+* Tests moved to separate file.
+== 2.0 (2005-08-23)
+* Added encoding to entities.
+* Decoding interface unchanged.
+* Fixed a bug with handling high codepoints.
+== 1.0 (2005-08-03)
+* Initial release.
+* Decoding only.

data/lib/htmlentities/COPYING ADDED Viewed

@@ -0,0 +1,7 @@
+Copyright (c) 2005 Paul Battley
+Usage of the works is permitted provided that this instrument is retained
+with the works, so that any entity that uses the works is notified of this
+instrument.
+DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.

data/lib/htmlentities/README ADDED Viewed

@@ -0,0 +1,15 @@
+HTML entity encoding and decoding for Ruby
+This library extends the String class to allow encoding and decoding of
+HTML/XML entities from/to their corresponding UTF-8 codepoints.
+To install (requires root/admin privileges):
+# ruby setup.rb
+To test:
+$ ruby setup.rb test
+Comments are welcome.  Send an email to pbattley @ gmail.com.

data/lib/htmlentities/htmlentities.rb ADDED Viewed

@@ -0,0 +1,281 @@
+#
+# HTML entity encoding and decoding for Ruby
+#
+# Author::  Paul BATTLEY (pbattley @ gmail.com)
+# Version:: 2.2
+# Date::    2005-11-07
+#
+# == About
+#
+# This library extends the String class to allow encoding and decoding of
+# HTML/XML entities from/to their corresponding UTF-8 codepoints.
+#
+# == Licence
+#
+# Copyright (c) 2005 Paul Battley
+#
+# Usage of the works is permitted provided that this instrument is retained
+# with the works, so that any entity that uses the works is notified of this
+# instrument.
+#
+# DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.
+#
+module HTMLEntities
+    VERSION = '2.2'
+    #
+    # MAP is a hash of all the HTML entities I could discover, as taken
+    # from the w3schools page on the subject:
+    # http://www.w3schools.com/html/html_entitiesref.asp
+    # The format is 'entity name' => codepoint where entity name is given
+    # without the surrounding ampersand and semicolon.
+    #
+    MAP = {
+        'quot'      => 34,
+        'apos'      => 39,
+        'amp'       => 38,
+        'lt'        => 60,
+        'gt'        => 62,
+        'nbsp'      => 160,
+        'iexcl'     => 161,
+        'curren'    => 164,
+        'cent'      => 162,
+        'pound'     => 163,
+        'yen'       => 165,
+        'brvbar'    => 166,
+        'sect'      => 167,
+        'uml'       => 168,
+        'copy'      => 169,
+        'ordf'      => 170,
+        'laquo'     => 171,
+        'not'       => 172,
+        'shy'       => 173,
+        'reg'       => 174,
+        'trade'     => 8482,
+        'macr'      => 175,
+        'deg'       => 176,
+        'plusmn'    => 177,
+        'sup2'      => 178,
+        'sup3'      => 179,
+        'acute'     => 180,
+        'micro'     => 181,
+        'para'      => 182,
+        'middot'    => 183,
+        'cedil'     => 184,
+        'sup1'      => 185,
+        'ordm'      => 186,
+        'raquo'     => 187,
+        'frac14'    => 188,
+        'frac12'    => 189,
+        'frac34'    => 190,
+        'iquest'    => 191,
+        'times'     => 215,
+        'divide'    => 247,
+        'Agrave'    => 192,
+        'Aacute'    => 193,
+        'Acirc'     => 194,
+        'Atilde'    => 195,
+        'Auml'      => 196,
+        'Aring'     => 197,
+        'AElig'     => 198,
+        'Ccedil'    => 199,
+        'Egrave'    => 200,
+        'Eacute'    => 201,
+        'Ecirc'     => 202,
+        'Euml'      => 203,
+        'Igrave'    => 204,
+        'Iacute'    => 205,
+        'Icirc'     => 206,
+        'Iuml'      => 207,
+        'ETH'       => 208,
+        'Ntilde'    => 209,
+        'Ograve'    => 210,
+        'Oacute'    => 211,
+        'Ocirc'     => 212,
+        'Otilde'    => 213,
+        'Ouml'      => 214,
+        'Oslash'    => 216,
+        'Ugrave'    => 217,
+        'Uacute'    => 218,
+        'Ucirc'     => 219,
+        'Uuml'      => 220,
+        'Yacute'    => 221,
+        'THORN'     => 222,
+        'szlig'     => 223,
+        'agrave'    => 224,
+        'aacute'    => 225,
+        'acirc'     => 226,
+        'atilde'    => 227,
+        'auml'      => 228,
+        'aring'     => 229,
+        'aelig'     => 230,
+        'ccedil'    => 231,
+        'egrave'    => 232,
+        'eacute'    => 233,
+        'ecirc'     => 234,
+        'euml'      => 235,
+        'igrave'    => 236,
+        'iacute'    => 237,
+        'icirc'     => 238,
+        'iuml'      => 239,
+        'eth'       => 240,
+        'ntilde'    => 241,
+        'ograve'    => 242,
+        'oacute'    => 243,
+        'ocirc'     => 244,
+        'otilde'    => 245,
+        'ouml'      => 246,
+        'oslash'    => 248,
+        'ugrave'    => 249,
+        'uacute'    => 250,
+        'ucirc'     => 251,
+        'uuml'      => 252,
+        'yacute'    => 253,
+        'thorn'     => 254,
+        'yuml'      => 255,
+        'OElig'     => 338,
+        'oelig'     => 339,
+        'Scaron'    => 352,
+        'scaron'    => 353,
+        'Yuml'      => 376,
+        'circ'      => 710,
+        'tilde'     => 732,
+        'ensp'      => 8194,
+        'emsp'      => 8195,
+        'thinsp'    => 8201,
+        'zwnj'      => 8204,
+        'zwj'       => 8205,
+        'lrm'       => 8206,
+        'rlm'       => 8207,
+        'ndash'     => 8211,
+        'mdash'     => 8212,
+        'lsquo'     => 8216,
+        'rsquo'     => 8217,
+        'sbquo'     => 8218,
+        'ldquo'     => 8220,
+        'rdquo'     => 8221,
+        'bdquo'     => 8222,
+        'dagger'    => 8224,
+        'Dagger'    => 8225,
+        'hellip'    => 8230,
+        'permil'    => 8240,
+        'lsaquo'    => 8249,
+        'rsaquo'    => 8250,
+        'euro'      => 8364
+    }
+    MIN_LENGTH = MAP.keys.map{ |a| a.length }.min
+    MAX_LENGTH = MAP.keys.map{ |a| a.length }.max
+    # Precompile the regexp
+    NAMED_ENTITY_REGEXP =
+        /&([a-z]{#{HTMLEntities::MIN_LENGTH},#{HTMLEntities::MAX_LENGTH}});/i
+    # Reverse map for converting characters to named entities
+    REVERSE_MAP = MAP.invert
+    BASIC_ENTITY_REGEXP = /[<>'"&]/
+    UTF8_NON_ASCII_REGEXP = /[\x00-\x1f]|[\xc0-\xfd][\x80-\xbf]+/
+end
+class String
+    # Because there's no need to make the user worry about the order here,
+    # let's handle it.
+    ENCODE_ENTITIES_COMMAND_ORDER = {
+        :basic => 0,
+        :named => 1,
+        :decimal => 2,
+        :hexadecimal => 3
+    }
+    #
+    # Decode XML and HTML 4.01 entities in a string into their UTF-8
+    # equivalents.  Obviously, if your string is not already in UTF-8, you'd
+    # better convert it before using this method, or the output will be mixed
+    # up.
+    # Unknown named entities are not converted
+    #
+    def decode_entities
+        return gsub(HTMLEntities::NAMED_ENTITY_REGEXP) {
+            HTMLEntities::MAP.has_key?($1) ? [HTMLEntities::MAP[$1]].pack('U') : $&
+        }.gsub(/&#([0-9]{1,7});/) {
+            [$1.to_i].pack('U')
+        }.gsub(/&#x([0-9a-f]{1,6});/i) {
+            [$1.to_i(16)].pack('U')
+        }
+    end
+    #
+    # Encode codepoints into their corresponding entities.  Various operations
+    # are possible, and may be specified in order:
+    #
+    # :basic :: Convert the five XML entities ('"<>&)
+    # :named :: Convert non-ASCII characters to their named HTML 4.01 equivalent
+    # :decimal :: Convert non-ASCII characters to decimal entities (e.g. &#1234;)
+    # :hexadecimal :: Convert non-ASCII characters to hexadecimal entities (e.g. # &#x12ab;)
+    #
+    # You can specify the commands in any order, but they will be executed in
+    # the order listed above to ensure that entity ampersands are not
+    # clobbered and that named entities are replaced before numeric ones.
+    #
+    # If no instructions are specified, :basic will be used.
+    #
+    # Examples:
+    #   str.encode_entities - XML-safe
+    #   str.encode_entities(:basic, :decimal) - XML-safe and 7-bit clean
+    #   str.encode_entities(:basic, :named, :decimal) - 7-bit clean, with all
+    #   non-ASCII characters replaced with their named entity where possible, and
+    #   decimal equivalents otherwise.
+    #
+    # Note: It is the program's responsibility to ensure that the string
+    # contains valid UTF-8 before calling this method.
+    #
+    def encode_entities(*instructions)
+        str = nil
+        if (instructions.empty?)
+            instructions = [:basic]
+        else
+            instructions.each do |instr|
+                unless ENCODE_ENTITIES_COMMAND_ORDER[instr]
+                    raise RuntimeError, "unknown encode_entities command `#{instr.inspect}'"
+                end
+            end
+            instructions.sort! { |a,b|
+                ENCODE_ENTITIES_COMMAND_ORDER[a] <=>
+                ENCODE_ENTITIES_COMMAND_ORDER[b]
+            }
+        end
+        instructions.each do |instruction|
+            case instruction
+            when :basic
+                # Handled as basic ASCII
+                str = (str || self).gsub(HTMLEntities::BASIC_ENTITY_REGEXP) {
+                    # It's safe to use the simpler [0] here because we know
+                    # that the basic entities are ASCII.
+                    '&' << HTMLEntities::REVERSE_MAP[$&[0]] << ';'
+                }
+            when :named
+                # Test everything except printable ASCII
+                str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
+                    cp = $&.unpack('U')[0]
+                    (e = HTMLEntities::REVERSE_MAP[cp]) ?  "&#{e};" : $&
+                }
+            when :decimal
+                str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
+                    "&##{$&.unpack('U')[0]};"
+                }
+            when :hexadecimal
+                str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
+                    "&#x#{$&.unpack('U')[0].to_s(16)};"
+                }
+            end
+        end
+        return str
+    end
+end