RubyGems - xapian-indexer - Versions diffs - 1.2.3.1 - Mend

xapian-indexer 1.2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

data/lib/xapian/indexer.rb +12 -0
data/lib/xapian/indexer/extensions.rb +45 -0
data/lib/xapian/indexer/extractors/html.rb +106 -0
data/lib/xapian/indexer/loaders/http.rb +62 -0
data/lib/xapian/indexer/resource.rb +165 -0
data/lib/xapian/indexer/spider.rb +182 -0
data/lib/xapian/indexer/version.rb +27 -0
metadata +74 -0

data/lib/xapian/indexer.rb ADDED

@@ -0,0 +1,12 @@
+require 'logger'
+require 'xapian/indexer/extensions'
+require 'xapian/indexer/version'
+require 'xapian/indexer/resource'
+require 'xapian/indexer/spider'
+# Top-level namespace; the spider code in this gem also uses Xapian::Document
+# from the Xapian bindings under the same module name.
+module Xapian
+	# Namespace for the indexer. This file only declares it; the requires above
+	# load the extensions, version, resource and spider files.
+	module Indexer
+	end
+end

data/lib/xapian/indexer/extensions.rb ADDED

@@ -0,0 +1,45 @@
+# Copyright (c) 2010 Samuel Williams. Released under the GNU GPLv3.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+require 'uri'
+class URI::Generic
+	def absolute_path?
+		path.match('^/')
+	end
+	def relative_path?
+		!absolute_path?
+	end
+	# Behavior in 1.8.7 seems to be broken...?
+	def merge0(oth)
+		case oth
+		when Generic
+		when String
+			oth = URI.parse(oth)
+		else
+			raise ArgumentError, "bad argument(expected URI object or URI string)"
+		end
+		if oth.absolute?
+			return oth, oth
+		else
+			return self.dup, oth
+		end
+	end
+end
+# puts URI.parse("/bob/dole") + URI.parse("http://www.lucidsystems.org")

data/lib/xapian/indexer/extractors/html.rb ADDED

@@ -0,0 +1,106 @@
+# Copyright (c) 2010 Samuel Williams. Released under the GNU GPLv3.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+require 'nokogiri'
+module Xapian
+	module Indexer
+		module Extractors
+			# Represents a resource that will be indexed
+			class HTML
+				def initialize(options = {})
+					@options = options
+					@logger = options[:logger] || Logger.new($stderr)
+				end
+				def call(resource, status, headers, data)
+					html = Nokogiri::HTML.parse(data)
+					result = {}
+					# Extract description
+					meta_description = html.css("meta[name='description']").first
+					if meta_description
+						result[:description] = meta_description['content']
+					else
+						# Use the first paragraph as a description
+						first_paragraph = html.search("p").first
+						if first_paragraph
+							result[:description] = first_paragraph.inner_text
+						end
+					end
+					base_tag = html.at('html/head/base')
+					if base_tag
+						base = URI.parse(base_tag['href'])
+					else
+						base = URI.parse(resource.name)
+					end
+					links = []
+					html.css('a').each do |link|
+						href = (link['href'] || "").to_s.gsub(/ /, '%20')
+						# No scheme but starts with a '/'
+						#begin
+							links << (base + href)
+						#rescue
+						#	$stderr.puts "Could not add link #{href}: #{$!}"
+						#end
+					end
+					# Remove any fragment at the end of the URI.
+					links.each{|link| link.fragment = nil}
+					# Convert to strings and uniq.
+					result[:links] = links.map{|link| link.to_s}.uniq
+					#$stderr.puts "Extracted links = #{result[:links].inspect}"
+					# Extract title
+					title_tag = html.at('html/head/title')
+					h1_tag = html.search('h1').first
+					if title_tag
+						result[:title] = title_tag.inner_text
+					elsif h1_tag
+						result[:title] = h1_tag.inner_text
+					end
+					# Extract keywords
+					meta_keywords = html.css("meta[name='keyword']").first
+					if meta_keywords
+						result[:keywords] = meta_keywords['content']
+					end
+					# Remove junk elements from the html
+					html.search("script").remove
+					html.search("link").remove
+					html.search("meta").remove
+					html.search("style").remove
+					html.search("form").remove
+					html.css('.noindex').remove
+					result[:content] = html.at('html/body').inner_text
+					return result
+				end
+			end
+		end
+	end
+end

data/lib/xapian/indexer/loaders/http.rb ADDED

@@ -0,0 +1,62 @@
+# Copyright (c) 2010 Samuel Williams. Released under the GNU GPLv3.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+require 'net/http'
+require 'xapian/indexer/version'
+module Xapian
+	module Indexer
+		module Loaders
+			class HTTP
+				UserAgent = "Xapian-Spider #{Xapian::Indexer::VERSION::STRING}"
+				def initialize(options = {})
+					@options = options
+					@logger = options[:logger] || Logger.new($stderr)
+				end
+				# Extract metadata from the document, including :content and :links
+				def call(name, &block)
+					uri = URI.parse(name)
+					if uri.absolute?
+						Net::HTTP.start(uri.host, uri.port) do |http|
+							head = http.request_head(uri.path, 'User-Agent' => UserAgent)
+							body = lambda do
+								page = http.request_get(uri.path, 'User-Agent' => UserAgent)
+								page.body
+							end
+							@logger.info "Loading external URI: #{name.inspect}"
+							yield head.code.to_i, head.header, body
+						end
+						return true
+					end
+					return false
+				end
+			end
+		end
+	end
+end

data/lib/xapian/indexer/resource.rb ADDED

@@ -0,0 +1,165 @@
+# Copyright (c) 2010 Samuel Williams. Released under the GNU GPLv3.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+require 'digest/md5'
+require 'uri'
+require 'yaml'
+module Xapian
+	module Indexer
+		class Controller
+			def initialize(options = {})
+				@extractors = {}
+				@loaders = []
+				@logger = options[:logger] || Logger.new($stderr)
+			end
+			attr :loaders
+			attr :extractors
+			def create(name)
+				Resource.new(name, self)
+			end
+			def load(resource, &block)
+				@loaders.each do |loader|
+					loader.call(resource.name) do |status, header, load_body|
+						if status >= 200 && status < 300
+							# Process the page content
+							mime_type = header['content-type'].split(";").first
+							extractor = @extractors[mime_type]
+							if extractor
+								body = load_body.call
+								metadata = extractor.call(resource, status, header, body)
+								# Load the data into the resource
+								yield status, header, body, metadata
+								return true
+							else
+								@logger.warn "Ignoring resource #{resource.name} because content-type #{mime_type} is not supported."
+								return false
+							end
+						elsif status >= 300 && status < 400
+							# Process the redirect
+							location = URI.parse(resource.name) + header['location']
+							metadata = {
+								:links => [location.to_s]
+							}
+							# This resource is not indexable, using nil for body
+							yield status, header, nil, metadata
+						end
+					end
+				end
+				return false
+			end
+			def save(resource)
+				YAML::dump(resource.to_hash)
+			end
+			def recreate(data)
+				values = YAML::load(data)
+				Resource.new(values[:name], self, values)
+			end
+		end
+		# Represents a resource that will be indexed
+		class Resource
+			def initialize(name, controller, values = {})
+				@name = name
+				@controller = controller
+				@fetched_on = values[:fetched_on]
+				@status = values[:status]
+				@header = values[:header]
+				@body = values[:body]
+				@metadata = values[:metadata]
+			end
+			attr :name
+			attr :status
+			attr :header
+			attr :body
+			attr :metadata
+			def to_hash
+				{
+					:fetched_on => @fetched_on,
+					:name => @name,
+					:status => @status,
+					:header => @header,
+					:body => @body,
+					:metadata => @metadata
+				}
+			end
+			# The data that will be indexed
+			def content
+				[@metadata[:content] || @body, @metadata[:title], @metadata[:description], @metadata[:keywords]].compact.join(" ")
+			end
+			def links
+				@metadata[:links] if @metadata
+			end
+			def fresh?(at = Time.now)
+				cache_control = @header['cache-control'] || ""
+				fetched_age = @header['age'] || ""
+				max_age = 3600
+				if cache_control.match(/max-age=([0-9]+)/)
+					max_age = $1.to_i
+					if fetched_age.match(/([0-9]+)/)
+						max_age -= $1.to_i
+					end
+				end
+				age = at - @fetched_on
+				# If the page is younger than the max_age the page can be considered fresh.
+				return age < max_age
+			end
+			def fetch!
+				@controller.load(self) do |status, header, body, metadata|
+					@fetched_on = Time.now
+					@status = status
+					@header = header
+					@body = body
+					@metadata = metadata
+				end
+			end
+			def fetched?
+				@fetched_on != nil
+			end
+			def content?
+				@body != nil
+			end
+			def name_digest
+				"Q" + Digest::MD5.hexdigest(@name)
+			end
+		end
+	end
+end

data/lib/xapian/indexer/spider.rb ADDED

@@ -0,0 +1,182 @@
+# Copyright (c) 2010 Samuel Williams. Released under the GNU GPLv3.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+require 'xapian'
+require 'set'
+module Xapian
+	module Indexer
+		# Represents a process which consumes resources into the database
+		# and follows links to related resources
+		class Spider
+			# database = Xapian::Database.new(ARGV[0])
+			def initialize(database, generator, controller, options = {})
+				@database = database
+				@generator = generator
+				@controller = controller
+				@links = []
+				@touched = Set.new
+				@logger = options[:logger] || Logger.new($stdout)
+			end
+			attr :resources
+			def add(root)
+				case root
+				when String
+					@links << root
+				when Array
+					@links += root
+				else
+					@logger.error "Could not add roots #{root.inspect}!"
+				end
+			end
+			class Fetch
+				def initialize(database, controller, link)
+					@database = database
+					@controller = controller
+					@document = false
+					@current_resource = controller.create(link)
+					@archived_resource = false
+				end
+				attr :database
+				attr :controller
+				attr :current_resource
+				def document
+					if @document === false
+						postlist = @database.postlist(@current_resource.name_digest)
+						if postlist.size > 0
+							@document = @database.document(postlist[0].docid)
+						else
+							@document = nil
+						end
+					end
+					return @document
+				end
+				def archived_resource
+					if @archived_resource === false
+						if document
+							@archived_resource = @controller.recreate(document.data)
+						end
+					end
+					return @archived_resource
+				end
+				def links
+					#$stderr.puts "current_resource.links = #{@current_resource.links.inspect}" if @current_resource
+					#$stderr.puts "archived_resource.links = #{archived_resource.links.inspect}" if archived_resource
+					if @current_resource.fetched?
+						@current_resource.links
+					elsif archived_resource
+						archived_resource.links
+					end
+				end
+			end
+			def process(options = {}, &block)
+				count = 0
+				depth = 0
+				until @links.empty?
+					new_links = []
+					@links.each do |link|
+						# Mark and sweep - don't review the same resource twice!
+						next if @touched.include?(link)
+						@touched << link
+						# Create a new fetch from the database...
+						fetch = Fetch.new(@database, @controller, link)
+						resource = fetch.current_resource
+						# Does it already exist in the current database (and fresh?)
+						unless fetch.archived_resource && fetch.archived_resource.fresh?
+							# Fetch the resource and add it to the index
+							begin
+								@logger.info "Indexing #{resource.name}..."
+								resource.fetch!
+							rescue
+								@logger.error "Could not fetch resource #{resource.name}: #{$!}!"
+								$!.backtrace.each{|line| @logger.error(line)}
+							end
+							# Did we fetch a resource and was it indexable?
+							if resource.fetched?
+								if resource.content?
+									doc = Xapian::Document.new
+									doc.data = @controller.save(resource)
+									doc.add_term(resource.name_digest)
+									@generator.document = doc
+									@generator.index_text(resource.content)
+									@database.replace_document(resource.name_digest, doc)
+								else
+									@logger.warn "Resource was not indexable #{resource.name}!"
+									@logger.warn "Links = #{(fetch.links || []).map(&block).compact.inspect}"
+								end
+							else
+								@logger.warn "Could not fetch resource #{resource.name}!"
+							end
+						else
+							@logger.info "Still fresh #{resource.name}..."
+						end
+						new_links += (fetch.links || []).map(&block).compact
+						count += 1
+						if options[:count] && count > options[:count]
+							# If we have to leave before finishing this breadth...
+							@links += new_links
+							return count
+						end
+					end
+					@links = new_links
+					depth += 1
+					return count if options[:depth] && depth > options[:depth]
+				end
+			end
+			def remove_old!
+				postlist = @database.postlist("")
+				postlist.each do |post|
+					document = @database.document(post.docid)
+					resource = @controller.recreate(document.data)
+					unless resource.fresh?
+						@logger.info "Removing expired index for #{resource.name}."
+						@database.delete_document(post.docid)
+					end
+				end
+			end
+		end
+	end
+end

data/lib/xapian/indexer/version.rb ADDED

@@ -0,0 +1,27 @@
+# Copyright (c) 2010 Samuel Williams. Released under the GNU GPLv3.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+module Xapian
+	module Indexer
+		module VERSION #:nodoc:
+			MAJOR = 1
+			MINOR = 2
+			TINY  = 3
+			REV   = 1
+			STRING = [MAJOR, MINOR, TINY, REV].join('.')
+		end
+	end
+end

metadata ADDED

@@ -0,0 +1,74 @@
+--- !ruby/object:Gem::Specification
+name: xapian-indexer
+version: !ruby/object:Gem::Version
+  hash: 65
+  prerelease: false
+  segments:
+  - 1
+  - 2
+  - 3
+  - 1
+  version: 1.2.3.1
+platform: ruby
+authors:
+- Samuel Williams
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2010-12-19 00:00:00 +13:00
+default_executable:
+dependencies: []
+description:
+email: samuel.williams@oriontransfer.co.nz
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/xapian/indexer/extensions.rb
+- lib/xapian/indexer/extractors/html.rb
+- lib/xapian/indexer/loaders/http.rb
+- lib/xapian/indexer/resource.rb
+- lib/xapian/indexer/spider.rb
+- lib/xapian/indexer/version.rb
+- lib/xapian/indexer.rb
+has_rdoc: true
+homepage: http://www.oriontransfer.co.nz/software/xapian
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.7
+signing_key:
+specification_version: 3
+summary: Xapian is a framework for fast full-text searching.
+test_files: []