xapian-indexer 1.2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+
2
+ require 'logger'
3
+
4
+ require 'xapian/indexer/extensions'
5
+ require 'xapian/indexer/version'
6
+ require 'xapian/indexer/resource'
7
+ require 'xapian/indexer/spider'
8
+
9
+ module Xapian # Top-level namespace; the files required above reopen it to add their own definitions.
10
+ module Indexer # Namespace for the indexer. Declared empty here; its contents come from the files required above.
11
+ end
12
+ end
@@ -0,0 +1,45 @@
1
+ # Copyright (c) 2010 Samuel Williams. Released under the GNU GPLv3.
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require 'uri'
17
+
18
# Extensions to the standard library's generic URI class used by the indexer.
class URI::Generic
  # Whether the path component is rooted, i.e. begins with "/".
  #
  # The path is coerced with to_s first so that a URI whose path is nil is
  # reported as not absolute instead of raising NoMethodError.
  #
  # Returns true or false (never a MatchData).
  def absolute_path?
    path.to_s.start_with?('/')
  end

  # Logical negation of #absolute_path?.
  #
  # Returns true or false.
  def relative_path?
    !absolute_path?
  end

  # Normalises +oth+ and decides which URI acts as the base of a merge.
  #
  # The original author added this because the behaviour in Ruby 1.8.7
  # seemed to be broken.
  # NOTE(review): presumably this overrides a helper that URI#merge calls in
  # older standard libraries - confirm whether current versions still use it.
  #
  # oth - a URI::Generic instance or a String that URI.parse accepts.
  #
  # Returns a two element list [base, rel]:
  #   * [oth, oth] when +oth+ is absolute (it replaces the receiver entirely),
  #   * [copy of self, oth] otherwise.
  # Raises ArgumentError when +oth+ is neither a URI nor a String.
  def merge0(oth)
    case oth
    when URI::Generic
      # Already a URI object, nothing to convert.
    when String
      oth = URI.parse(oth)
    else
      raise ArgumentError, "bad argument(expected URI object or URI string)"
    end

    if oth.absolute?
      return oth, oth
    else
      return dup, oth
    end
  end
end
44
+
45
+ # puts URI.parse("/bob/dole") + URI.parse("http://www.lucidsystems.org")
@@ -0,0 +1,106 @@
1
+ # Copyright (c) 2010 Samuel Williams. Released under the GNU GPLv3.
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require 'nokogiri'
17
+
18
require 'logger'
require 'uri'

module Xapian
  module Indexer
    module Extractors
      # Extracts indexable metadata from an HTML document: a description, the
      # outgoing links, a title, keywords and the plain-text body content.
      class HTML
        # CSS selectors for elements whose text must not be indexed.
        JUNK_SELECTORS = %w[script link meta style form .noindex].freeze

        # options - Hash of settings. Only :logger is read here (defaults to a
        #           Logger writing to $stderr); the hash itself is kept in
        #           @options.
        def initialize(options = {})
          @options = options

          @logger = options[:logger] || Logger.new($stderr)
        end

        # Parses +data+ as HTML and collects metadata from it.
        #
        # resource - object responding to #name; the name is used as the base
        #            URI for relative links when the page has no usable <base>.
        # status   - unused, accepted so all extractors share one signature.
        # headers  - unused, accepted so all extractors share one signature.
        # data     - the HTML source, handed to Nokogiri::HTML.parse.
        #
        # Returns a Hash which may contain :description, :links (always
        # present, unique strings without fragments), :title, :keywords and
        # :content. :content is omitted when the document has no <body>.
        def call(resource, status, headers, data)
          html = Nokogiri::HTML.parse(data)
          result = {}

          # Prefer the explicit meta description, fall back to the first paragraph.
          meta_description = html.css("meta[name='description']").first

          if meta_description
            result[:description] = meta_description['content']
          else
            first_paragraph = html.search("p").first
            result[:description] = first_paragraph.inner_text if first_paragraph
          end

          result[:links] = extract_links(html, base_uri(html, resource))

          # Prefer <title>, fall back to the first <h1>.
          title_tag = html.at('html/head/title') || html.search('h1').first
          result[:title] = title_tag.inner_text if title_tag

          # "keywords" is the conventional meta name; the singular form is still
          # honoured because earlier versions only looked for that.
          meta_keywords = html.css("meta[name='keywords']").first ||
                          html.css("meta[name='keyword']").first
          result[:keywords] = meta_keywords['content'] if meta_keywords

          # Strip non-content elements before flattening the body to text.
          JUNK_SELECTORS.each { |selector| html.css(selector).remove }

          body = html.at('html/body')
          result[:content] = body.inner_text if body

          result
        end

        private

        # Picks the URI that relative links are resolved against: the href of
        # <base> when it is present and non-blank, otherwise the resource name.
        def base_uri(html, resource)
          base_tag = html.at('html/head/base')
          href = base_tag ? base_tag['href'].to_s.strip : ""

          URI.parse(href.empty? ? resource.name : href)
        end

        # Resolves every <a href> against +base+.
        #
        # Anchors without an href are skipped. Hrefs that cannot be parsed or
        # merged are logged and skipped, so one bad link does not discard the
        # whole page.
        #
        # Returns an Array of unique URI strings with fragments removed.
        def extract_links(html, base)
          links = []

          html.css('a').each do |anchor|
            href = anchor['href'].to_s.gsub(/ /, '%20')
            next if href.empty?

            begin
              link = base + href
              link.fragment = nil
              links << link.to_s
            rescue URI::Error => error
              @logger.warn "Could not add link #{href.inspect}: #{error.message}"
            end
          end

          links.uniq
        end
      end
    end
  end
end
106
+
@@ -0,0 +1,62 @@
1
+ # Copyright (c) 2010 Samuel Williams. Released under the GNU GPLv3.
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require 'net/http'
17
+ require 'xapian/indexer/version'
18
+
19
require 'logger'

module Xapian
  module Indexer
    module Loaders
      # Loads resources over HTTP or HTTPS using Net::HTTP.
      class HTTP
        # Value sent in the User-Agent header of every request.
        UserAgent = "Xapian-Spider #{Xapian::Indexer::VERSION::STRING}"

        # options - Hash of settings. Only :logger is read here (defaults to a
        #           Logger writing to $stderr); the hash itself is kept in
        #           @options.
        def initialize(options = {})
          @options = options

          @logger = options[:logger] || Logger.new($stderr)
        end

        # Issues a HEAD request for +name+ and yields the outcome.
        #
        # name - String URI of the resource to load.
        #
        # Yields the numeric status code, the response header object and a
        # lambda that performs the GET request and returns the body when called.
        # The lambda reuses the connection opened here, so it must be called
        # from inside the block.
        #
        # Returns true when the resource was requested, or false when +name+ is
        # not an http/https URI with a host, so that another loader may try it.
        # Raises whatever URI.parse and Net::HTTP raise on malformed names or
        # network failures.
        def call(name, &block)
          uri = URI.parse(name)

          # Only http(s) can be served by this loader; other absolute schemes
          # (mailto:, ftp:, ...) used to be attempted over HTTP as well.
          return false unless uri.is_a?(URI::HTTP) && uri.host

          # request_uri keeps the query string and is "/" for an empty path,
          # whereas the bare path drops the query and may be empty.
          target = uri.request_uri
          request_headers = {'User-Agent' => UserAgent}

          Net::HTTP.start(uri.host, uri.port, :use_ssl => uri.scheme == 'https') do |http|
            head = http.request_head(target, request_headers)

            body = lambda do
              http.request_get(target, request_headers).body
            end

            @logger.info "Loading external URI: #{name.inspect}"

            yield head.code.to_i, head.header, body
          end

          true
        end
      end
    end
  end
end
62
+
@@ -0,0 +1,165 @@
1
+ # Copyright (c) 2010 Samuel Williams. Released under the GNU GPLv3.
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require 'digest/md5'
17
+
18
require 'logger'
require 'uri'
require 'yaml'

module Xapian
  module Indexer
    # Coordinates loaders (which fetch a named resource) and extractors (which
    # turn a fetched body into metadata), and serialises resources.
    class Controller
      # options - Hash of settings; only :logger is read (defaults to a Logger
      #           writing to $stderr).
      def initialize(options = {})
        @extractors = {}
        @loaders = []

        @logger = options[:logger] || Logger.new($stderr)
      end

      # Array of loaders. Each is called with the resource name and a block,
      # and yields status, header and a callable returning the body.
      attr_reader :loaders

      # Hash mapping a mime type String to an extractor. Each extractor is
      # called with (resource, status, header, body) and returns metadata.
      attr_reader :extractors

      # Builds a new, unfetched resource bound to this controller.
      def create(name)
        Resource.new(name, self)
      end

      # Runs the loaders in order until one of them produces a usable response.
      #
      # resource - object responding to #name.
      #
      # Yields status, header, body and metadata for 2xx responses with a
      # supported content type, and status, header, nil and {:links => [...]}
      # for 3xx responses (the link is the resolved Location, if any).
      #
      # Returns true when a response was yielded, false otherwise (unsupported
      # or missing content type, non 2xx/3xx status, or no loader responded).
      def load(resource, &block)
        @loaders.each do |loader|
          loader.call(resource.name) do |status, header, load_body|
            if status >= 200 && status < 300
              # Drop parameters such as "; charset=utf-8". A missing header
              # becomes "" and is reported as unsupported instead of crashing.
              mime_type = header['content-type'].to_s.split(";").first.to_s.strip
              extractor = @extractors[mime_type]

              unless extractor
                @logger.warn "Ignoring resource #{resource.name} because content-type #{mime_type} is not supported."
                return false
              end

              # The body is only requested once we know it can be processed.
              body = load_body.call
              metadata = extractor.call(resource, status, header, body)

              yield status, header, body, metadata

              return true
            elsif status >= 300 && status < 400
              # A redirect is not indexable; expose its target as the only link.
              location = header['location']
              links = location ? [(URI.parse(resource.name) + location).to_s] : []

              yield status, header, nil, {:links => links}

              # The resource has been handled. Stop here so the remaining
              # loaders do not request and yield it again.
              return true
            end
          end
        end

        false
      end

      # Serialises +resource+ (via #to_hash) to a YAML String.
      def save(resource)
        YAML.dump(resource.to_hash)
      end

      # Rebuilds a resource from a String produced by #save.
      #
      # SECURITY: this deserialises arbitrary objects (the stored hash contains
      # a Time and the response header object), so +data+ must only ever come
      # from an index written by this library. Never pass untrusted input.
      def recreate(data)
        # Newer YAML versions restrict YAML.load to basic types, which rejects
        # the stored Time; unsafe_load is the explicit full-object loader there.
        values = YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(data) : YAML.load(data)

        Resource.new(values[:name], self, values)
      end
    end

    # Represents a resource that will be indexed, together with the response
    # captured by the last fetch.
    class Resource
      # Lifetime in seconds assumed when the response has no max-age directive.
      DEFAULT_MAX_AGE = 3600

      # name       - String identifying the resource (passed to the loaders).
      # controller - object responding to #load, used by #fetch!.
      # values     - Hash of previously stored state with the optional keys
      #              :fetched_on, :status, :header, :body and :metadata.
      def initialize(name, controller, values = {})
        @name = name
        @controller = controller

        @fetched_on = values[:fetched_on]
        @status = values[:status]
        @header = values[:header]
        @body = values[:body]
        @metadata = values[:metadata]
      end

      attr_reader :name
      attr_reader :status
      attr_reader :header
      attr_reader :body
      attr_reader :metadata

      # Returns the state of this resource as a Hash suitable for #initialize.
      def to_hash
        {
          :fetched_on => @fetched_on,
          :name => @name,
          :status => @status,
          :header => @header,
          :body => @body,
          :metadata => @metadata
        }
      end

      # The data that will be indexed: extracted content (or the raw body when
      # there is none), title, description and keywords joined by spaces.
      # Missing parts, including missing metadata, are skipped.
      def content
        metadata = @metadata || {}

        [metadata[:content] || @body, metadata[:title], metadata[:description], metadata[:keywords]].compact.join(" ")
      end

      # Returns the extracted links, or nil when there is no metadata.
      def links
        @metadata[:links] if @metadata
      end

      # Whether the stored response may still be used at time +at+.
      #
      # The lifetime is DEFAULT_MAX_AGE seconds unless the cache-control header
      # carries max-age, in which case that value minus the age header (if
      # any) is used. A resource that was never fetched is never fresh.
      #
      # Returns true or false.
      def fresh?(at = Time.now)
        return false unless @fetched_on

        header = @header || {}
        max_age = DEFAULT_MAX_AGE

        directive = header['cache-control'].to_s.match(/max-age=([0-9]+)/)
        if directive
          max_age = directive[1].to_i

          # Time the response already spent in upstream caches.
          reported_age = header['age'].to_s.match(/([0-9]+)/)
          max_age -= reported_age[1].to_i if reported_age
        end

        # If the page is younger than the max_age the page can be considered fresh.
        (at - @fetched_on) < max_age
      end

      # Loads this resource through the controller and stores the response.
      # State is only updated when the controller yields a response.
      #
      # Returns the controller's result (true when a response was stored).
      def fetch!
        @controller.load(self) do |status, header, body, metadata|
          @fetched_on = Time.now
          @status = status
          @header = header
          @body = body
          @metadata = metadata
        end
      end

      # Whether a response has been stored for this resource.
      def fetched?
        !@fetched_on.nil?
      end

      # Whether the stored response has a body (redirects do not).
      def content?
        !@body.nil?
      end

      # Term under which this resource is stored in the index: "Q" followed by
      # the MD5 hex digest of the name.
      def name_digest
        "Q" + Digest::MD5.hexdigest(@name)
      end
    end
  end
end
@@ -0,0 +1,182 @@
1
+ # Copyright (c) 2010 Samuel Williams. Released under the GNU GPLv3.
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require 'xapian'
17
+ require 'set'
18
+
19
require 'logger'

module Xapian
  module Indexer
    # Represents a process which consumes resources into the database
    # and follows links to related resources
    class Spider
      # database   - index to read from and write to; must respond to
      #              #postlist, #document, #replace_document and
      #              #delete_document.
      # generator  - term generator; must respond to #document= and #index_text.
      # controller - object responding to #create, #save and #recreate.
      # options    - Hash of settings; only :logger is read (defaults to a
      #              Logger writing to $stdout).
      def initialize(database, generator, controller, options = {})
        @database = database
        @generator = generator
        @controller = controller

        @links = []
        @touched = Set.new

        @logger = options[:logger] || Logger.new($stdout)
      end

      # NOTE(review): @resources is never assigned in this class, so this
      # reader always returns nil. It is kept only so existing callers do not
      # break.
      attr_reader :resources

      # Queues one root (String) or several roots (Array) for processing.
      # Anything else is logged as an error and ignored.
      def add(root)
        case root
        when String
          @links << root
        when Array
          @links += root
        else
          @logger.error "Could not add roots #{root.inspect}!"
        end
      end

      # Pairs the live resource for a link with whatever the database already
      # holds for it. Database lookups are performed lazily and cached.
      class Fetch
        def initialize(database, controller, link)
          @database = database
          @controller = controller

          # false means "not looked up yet"; nil means "looked up, not found".
          @document = false
          @current_resource = controller.create(link)
          @archived_resource = false
        end

        attr_reader :database
        attr_reader :controller
        attr_reader :current_resource

        # Returns the stored document for the current resource, or nil.
        def document
          if false.equal?(@document)
            postlist = @database.postlist(@current_resource.name_digest)

            @document = postlist.size > 0 ? @database.document(postlist[0].docid) : nil
          end

          @document
        end

        # Returns the resource rebuilt from the stored document, or nil when
        # nothing is stored.
        def archived_resource
          if false.equal?(@archived_resource)
            @archived_resource = document ? @controller.recreate(document.data) : nil
          end

          @archived_resource
        end

        # Returns the links of the freshly fetched resource if it was fetched,
        # otherwise those of the archived copy, otherwise nil.
        def links
          if @current_resource.fetched?
            @current_resource.links
          elsif archived_resource
            archived_resource.links
          end
        end
      end

      # Crawls the queued links breadth first, indexing every resource that is
      # missing from the database or no longer fresh.
      #
      # options - Hash with the optional limits:
      #           :count - stop once more than this many resources were handled
      #                    in this call (the remaining links stay queued);
      #           :depth - stop once more than this many breadths were
      #                    completed.
      # block   - optional; called once with each discovered link, returns the
      #           link to follow or nil to drop it. Without a block every link
      #           is followed.
      #
      # Returns the number of resources handled by this call.
      def process(options = {}, &block)
        count = 0
        depth = 0

        until @links.empty?
          new_links = []

          @links.each do |link|
            # Mark and sweep - don't review the same resource twice!
            next if @touched.include?(link)
            @touched << link

            fetch = Fetch.new(@database, @controller, link)
            resource = fetch.current_resource
            archived = fetch.archived_resource
            followed = nil

            if archived && archived.fresh?
              @logger.info "Still fresh #{resource.name}..."
            else
              fetch_resource(resource)

              if !resource.fetched?
                @logger.warn "Could not fetch resource #{resource.name}!"
              elsif resource.content?
                index_resource(resource)
              else
                # Computed here so the links can be logged, then reused below
                # so the caller's block runs only once per link.
                followed = follow(fetch.links, &block)

                @logger.warn "Resource was not indexable #{resource.name}!"
                @logger.warn "Links = #{followed.inspect}"
              end
            end

            followed ||= follow(fetch.links, &block)
            new_links.concat(followed)

            count += 1

            if options[:count] && count > options[:count]
              # If we have to leave before finishing this breadth...
              @links += new_links
              return count
            end
          end

          @links = new_links

          depth += 1

          return count if options[:depth] && depth > options[:depth]
        end

        # The queue drained without hitting a limit; report the total, as the
        # early exits above do.
        count
      end

      # Deletes every stored document whose resource is no longer fresh.
      def remove_old!
        @database.postlist("").each do |post|
          resource = @controller.recreate(@database.document(post.docid).data)

          next if resource.fresh?

          @logger.info "Removing expired index for #{resource.name}."
          @database.delete_document(post.docid)
        end
      end

      private

      # Fetches +resource+, logging (rather than propagating) any failure so
      # one bad resource does not stop the crawl.
      def fetch_resource(resource)
        @logger.info "Indexing #{resource.name}..."
        resource.fetch!
      rescue => error
        @logger.error "Could not fetch resource #{resource.name}: #{error}!"
        (error.backtrace || []).each { |line| @logger.error(line) }
      end

      # Stores +resource+ in the database under its name digest and indexes
      # its content.
      def index_resource(resource)
        doc = Xapian::Document.new
        doc.data = @controller.save(resource)
        doc.add_term(resource.name_digest)

        @generator.document = doc
        @generator.index_text(resource.content)
        @database.replace_document(resource.name_digest, doc)
      end

      # Applies the caller's optional filter block to +links+ (nil is treated
      # as no links) and drops the entries it rejected with nil.
      def follow(links, &block)
        links ||= []
        links = links.map(&block) if block

        links.compact
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # Copyright (c) 2010 Samuel Williams. Released under the GNU GPLv3.
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
module Xapian
  module Indexer
    # Version of the indexer, as separate components and as a dotted string.
    module VERSION #:nodoc:
      MAJOR = 1
      MINOR = 2
      TINY = 3
      REV = 1

      # Frozen so the shared constant cannot be modified in place by callers.
      STRING = [MAJOR, MINOR, TINY, REV].join('.').freeze
    end
  end
end
metadata ADDED
@@ -0,0 +1,74 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: xapian-indexer
3
+ version: !ruby/object:Gem::Version
4
+ hash: 65
5
+ prerelease: false
6
+ segments:
7
+ - 1
8
+ - 2
9
+ - 3
10
+ - 1
11
+ version: 1.2.3.1
12
+ platform: ruby
13
+ authors:
14
+ - Samuel Williams
15
+ autorequire:
16
+ bindir: bin
17
+ cert_chain: []
18
+
19
+ date: 2010-12-19 00:00:00 +13:00
20
+ default_executable:
21
+ dependencies: []
22
+
23
+ description:
24
+ email: samuel.williams@oriontransfer.co.nz
25
+ executables: []
26
+
27
+ extensions: []
28
+
29
+ extra_rdoc_files: []
30
+
31
+ files:
32
+ - lib/xapian/indexer/extensions.rb
33
+ - lib/xapian/indexer/extractors/html.rb
34
+ - lib/xapian/indexer/loaders/http.rb
35
+ - lib/xapian/indexer/resource.rb
36
+ - lib/xapian/indexer/spider.rb
37
+ - lib/xapian/indexer/version.rb
38
+ - lib/xapian/indexer.rb
39
+ has_rdoc: true
40
+ homepage: http://www.oriontransfer.co.nz/software/xapian
41
+ licenses: []
42
+
43
+ post_install_message:
44
+ rdoc_options: []
45
+
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ hash: 3
54
+ segments:
55
+ - 0
56
+ version: "0"
57
+ required_rubygems_version: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ hash: 3
63
+ segments:
64
+ - 0
65
+ version: "0"
66
+ requirements: []
67
+
68
+ rubyforge_project:
69
+ rubygems_version: 1.3.7
70
+ signing_key:
71
+ specification_version: 3
72
+ summary: Xapian is a framework for fast full-text searching.
73
+ test_files: []
74
+