xapian-indexer 1.2.3.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,12 @@
1
+
2
+ require 'logger'
3
+
4
+ require 'xapian/indexer/extensions'
5
+ require 'xapian/indexer/version'
6
+ require 'xapian/indexer/resource'
7
+ require 'xapian/indexer/spider'
8
+
9
# Top-level namespace of the library.
module Xapian
  # Namespace for the indexer components. It is intentionally empty here;
  # the files required before this definition reopen it to add their classes.
  module Indexer; end
end
@@ -0,0 +1,45 @@
1
+ # Copyright (c) 2010 Samuel Williams. Released under the GNU GPLv3.
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require 'uri'
17
+
18
# Extensions to URI::Generic: path predicates plus a replacement for merge0.
class URI::Generic
  # Whether the path component starts with "/".
  #
  # Always returns true or false. A URI without a path component (path is
  # nil, e.g. an opaque URI) is reported as not having an absolute path
  # instead of raising NoMethodError.
  def absolute_path?
    path.to_s.start_with?('/')
  end

  # The inverse of #absolute_path?.
  def relative_path?
    !absolute_path?
  end

  # Behavior in 1.8.7 seems to be broken...?
  #
  # Normalises the argument of a merge and returns a [base, relative] pair.
  #
  # oth - a URI::Generic, or a String that is parsed with URI.parse.
  #
  # When +oth+ is absolute, both elements are +oth+; otherwise the pair is
  # a copy of the receiver followed by +oth+.
  #
  # Raises ArgumentError when +oth+ is neither a URI nor a String.
  def merge0(oth)
    case oth
    when URI::Generic
      # Already a URI, use as-is.
    when String
      oth = URI.parse(oth)
    else
      raise ArgumentError, "bad argument(expected URI object or URI string)"
    end

    if oth.absolute?
      return oth, oth
    else
      return self.dup, oth
    end
  end
end
44
+
45
+ # puts URI.parse("/bob/dole") + URI.parse("http://www.lucidsystems.org")
@@ -0,0 +1,106 @@
1
+ # Copyright (c) 2010 Samuel Williams. Released under the GNU GPLv3.
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require 'nokogiri'
17
+
18
+ module Xapian
19
+ module Indexer
20
+ module Extractors
21
# Extracts indexable metadata from an HTML document: a description, the
# outgoing links, a title, keywords and the text content.
class HTML
  # options - Hash of settings. :logger receives warnings; when it is
  #           absent a Logger writing to $stderr is created.
  def initialize(options = {})
    @options = options

    @logger = options[:logger] || Logger.new($stderr)
  end

  # Parses +data+ as HTML and returns a Hash of metadata.
  #
  # resource - object whose #name is the document's URI; used as the base
  #            for relative links when the page has no usable <base> tag.
  # status   - unused by this extractor.
  # headers  - unused by this extractor.
  # data     - the HTML source.
  #
  # The returned Hash always has :links (Array of unique URI strings with
  # fragments removed) and :content (text of <body>, or "" when there is
  # no body). :description, :title and :keywords are present only when the
  # corresponding element was found.
  def call(resource, status, headers, data)
    html = Nokogiri::HTML.parse(data)
    result = {}

    # Extract description
    meta_description = html.css("meta[name='description']").first

    if meta_description
      result[:description] = meta_description['content']
    else
      # Use the first paragraph as a description
      first_paragraph = html.search("p").first
      result[:description] = first_paragraph.inner_text if first_paragraph
    end

    # Extract links
    hrefs = html.css('a').map { |anchor| anchor['href'] }
    result[:links] = resolve_links(base_uri(html, resource), hrefs)

    # Extract title, falling back to the first heading
    title_tag = html.at('html/head/title')
    h1_tag = html.search('h1').first
    if title_tag
      result[:title] = title_tag.inner_text
    elsif h1_tag
      result[:title] = h1_tag.inner_text
    end

    # Extract keywords. The standard meta name is "keywords"; the singular
    # form is still accepted because that is what this code used to match.
    meta_keywords = html.css("meta[name='keywords'], meta[name='keyword']").first
    result[:keywords] = meta_keywords['content'] if meta_keywords

    # Remove junk elements from the html
    %w[script link meta style form].each { |tag| html.search(tag).remove }
    html.css('.noindex').remove

    # A document without a <body> has no text content rather than
    # raising on nil.
    body = html.at('html/body')
    result[:content] = body ? body.inner_text : ""

    return result
  end

  private

  # Returns the URI that relative links are resolved against: the href of
  # <base> when present and parsable, otherwise the resource's own name.
  def base_uri(html, resource)
    base_tag = html.at('html/head/base')
    href = base_tag && base_tag['href'].to_s.strip

    if href && !href.empty?
      begin
        return URI.parse(href)
      rescue URI::Error => error
        @logger.warn "Ignoring invalid base href #{href.inspect}: #{error}"
      end
    end

    URI.parse(resource.name)
  end

  # Resolves each href against +base+ and returns the unique resulting URI
  # strings, in order of first appearance, with any fragment removed.
  #
  # Surrounding whitespace is dropped and inner spaces are escaped as %20.
  # A nil href resolves to +base+ itself. An href that cannot be parsed or
  # merged is logged and skipped so that one bad link does not discard
  # every other link on the page.
  def resolve_links(base, hrefs)
    links = []

    hrefs.each do |href|
      target = href.to_s.strip.gsub(/ /, '%20')

      begin
        link = base + target
        # Remove any fragment at the end of the URI.
        link.fragment = nil
        links << link.to_s
      rescue URI::Error => error
        @logger.warn "Could not add link #{target.inspect}: #{error}"
      end
    end

    links.uniq
  end
end
103
+ end
104
+ end
105
+ end
106
+
@@ -0,0 +1,62 @@
1
+ # Copyright (c) 2010 Samuel Williams. Released under the GNU GPLv3.
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require 'net/http'
17
+ require 'xapian/indexer/version'
18
+
19
+ module Xapian
20
+ module Indexer
21
+
22
+ module Loaders
23
# Loader that retrieves resources over HTTP or HTTPS.
class HTTP
  UserAgent = "Xapian-Spider #{Xapian::Indexer::VERSION::STRING}"

  # options - Hash of settings. :logger receives progress messages; when
  #           it is absent a Logger writing to $stderr is created.
  def initialize(options = {})
    @options = options

    @logger = options[:logger] || Logger.new($stderr)
  end

  # Issues a HEAD request for +name+ and yields the outcome.
  #
  # name - String URI of the resource.
  #
  # Yields the numeric status code, the HEAD response (which answers
  # header lookups such as response['content-type']) and a lambda that
  # performs the GET request and returns the body when called. The lambda
  # uses the connection opened here, so it must be called inside the block.
  #
  # Returns true when the URI was an http/https URI and the block was
  # yielded to, false when this loader does not handle the URI (relative
  # URIs and other schemes such as mailto: or ftp:).
  def call(name, &block)
    uri = URI.parse(name)

    # URI::HTTPS is a subclass of URI::HTTP, so this accepts both and
    # rejects everything Net::HTTP cannot talk to.
    return false unless uri.is_a?(URI::HTTP)

    connection = Net::HTTP.new(uri.host, uri.port)
    connection.use_ssl = true if uri.scheme == 'https'

    connection.start do |http|
      request_headers = {'User-Agent' => UserAgent}

      # request_uri keeps the query string and is "/" for an empty path.
      target = uri.request_uri

      head = http.request_head(target, request_headers)

      body = lambda do
        http.request_get(target, request_headers).body
      end

      @logger.info "Loading external URI: #{name.inspect}"

      yield head.code.to_i, head, body
    end

    return true
  end
end
56
+
57
+
58
+ end
59
+
60
+ end
61
+ end
62
+
@@ -0,0 +1,165 @@
1
+ # Copyright (c) 2010 Samuel Williams. Released under the GNU GPLv3.
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
require 'digest/md5'
require 'uri'
require 'yaml'
17
+
18
+ module Xapian
19
+ module Indexer
20
+
21
# Coordinates loaders (which retrieve a resource) and extractors (which
# turn a body of a given MIME type into metadata), and serialises
# resources to and from YAML.
class Controller
  # options - Hash of settings. :logger receives warnings; when it is
  #           absent a Logger writing to $stderr is created.
  def initialize(options = {})
    @extractors = {}
    @loaders = []

    @logger = options[:logger] || Logger.new($stderr)
  end

  # Array of loaders, tried in order by #load.
  attr_reader :loaders

  # Hash mapping a MIME type string to the extractor for that type.
  attr_reader :extractors

  # Returns a new, unfetched Resource for +name+ bound to this controller.
  def create(name)
    Resource.new(name, self)
  end

  # Retrieves +resource+ through the loaders.
  #
  # For a 2xx response whose MIME type has an extractor, yields
  # (status, header, body, metadata) and returns true. For a 3xx response,
  # yields (status, header, nil, {:links => [...]}), where the links hold
  # the redirect target (empty when there is no Location header), and then
  # carries on with the next loader.
  #
  # Returns false when the content type is unsupported or missing, or when
  # no loader produced an indexable response.
  def load(resource, &block)
    @loaders.each do |loader|
      loader.call(resource.name) do |status, header, load_body|
        if status >= 200 && status < 300
          # Process the page content. The header may be absent, padded with
          # whitespace or written in a different case, so normalise it.
          mime_type = header['content-type'].to_s.split(";").first.to_s.strip.downcase
          extractor = @extractors[mime_type]

          if extractor
            body = load_body.call
            metadata = extractor.call(resource, status, header, body)

            # Load the data into the resource
            yield status, header, body, metadata

            return true
          else
            @logger.warn "Ignoring resource #{resource.name} because content-type #{mime_type} is not supported."
            return false
          end
        elsif status >= 300 && status < 400
          # Process the redirect; a redirect without a Location header
          # simply has nowhere to go.
          location = header['location']
          links = location ? [(URI.parse(resource.name) + location).to_s] : []

          # This resource is not indexable, using nil for body
          yield status, header, nil, {:links => links}
        end
      end
    end

    return false
  end

  # Serialises +resource+ (via its #to_hash) to a YAML string.
  def save(resource)
    YAML.dump(resource.to_hash)
  end

  # Rebuilds a Resource from a YAML string produced by #save.
  #
  # NOTE(review): the saved hash uses Symbol keys, a Time and whatever
  # header object the loader supplied, so it cannot be read by a safe
  # loader. On YAML versions where load is safe by default, unsafe_load is
  # used instead. Only pass data written by #save into this method.
  def recreate(data)
    values = YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(data) : YAML.load(data)
    Resource.new(values[:name], self, values)
  end
end
82
+
83
# Represents a resource that will be indexed
class Resource
  # Freshness lifetime in seconds used when the response carries no
  # max-age directive.
  DEFAULT_MAX_AGE = 3600

  # name       - identifier of the resource (passed to the loaders).
  # controller - object whose #load(resource) performs the retrieval.
  # values     - Hash with optional :fetched_on, :status, :header, :body
  #              and :metadata, as produced by #to_hash.
  def initialize(name, controller, values = {})
    @name = name
    @controller = controller

    @fetched_on = values[:fetched_on]
    @status = values[:status]
    @header = values[:header]
    @body = values[:body]
    @metadata = values[:metadata]
  end

  attr_reader :name, :status, :header, :body, :metadata

  # Returns the state of the resource as a Hash accepted by #initialize.
  def to_hash
    {
      :fetched_on => @fetched_on,
      :name => @name,
      :status => @status,
      :header => @header,
      :body => @body,
      :metadata => @metadata
    }
  end

  # The data that will be indexed: the extracted content (or the raw body
  # when there is none) followed by title, description and keywords, joined
  # by spaces. Returns "" when nothing is available.
  def content
    metadata = @metadata || {}

    [metadata[:content] || @body, metadata[:title], metadata[:description], metadata[:keywords]].compact.join(" ")
  end

  # The links found in the resource, or nil when there is no metadata.
  def links
    @metadata[:links] if @metadata
  end

  # Whether the copy fetched at @fetched_on can still be used at time +at+.
  #
  # The lifetime is the max-age value of the Cache-Control header reduced
  # by the Age header, or DEFAULT_MAX_AGE when no max-age is given. A
  # resource that was never fetched is never fresh.
  def fresh?(at = Time.now)
    return false unless @fetched_on

    header = @header || {}
    cache_control = header['cache-control'] || ""
    fetched_age = header['age'] || ""
    max_age = DEFAULT_MAX_AGE

    declared = cache_control[/max-age=([0-9]+)/, 1]
    if declared
      max_age = declared.to_i

      # The response had already spent this long in a cache.
      elapsed = fetched_age[/([0-9]+)/, 1]
      max_age -= elapsed.to_i if elapsed
    end

    # If the page is younger than the max_age the page can be considered fresh.
    (at - @fetched_on) < max_age
  end

  # Asks the controller to load the resource and records the result
  # together with the time of the fetch.
  def fetch!
    @controller.load(self) do |status, header, body, metadata|
      @fetched_on = Time.now
      @status = status
      @header = header
      @body = body
      @metadata = metadata
    end
  end

  # Whether a fetch has been recorded.
  def fetched?
    !@fetched_on.nil?
  end

  # Whether the resource has a body.
  def content?
    !@body.nil?
  end

  # "Q" followed by the MD5 hex digest of the name.
  def name_digest
    "Q" + Digest::MD5.hexdigest(@name)
  end
end
163
+
164
+ end
165
+ end
@@ -0,0 +1,182 @@
1
+ # Copyright (c) 2010 Samuel Williams. Released under the GNU GPLv3.
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require 'xapian'
17
+ require 'set'
18
+
19
+ module Xapian
20
+ module Indexer
21
# Represents a process which consumes resources into the database
# and follows links to related resources
class Spider
  # database   - the index; must respond to postlist, document,
  #              replace_document and delete_document,
  #              e.g. database = Xapian::Database.new(ARGV[0])
  # generator  - term generator; receives document= and index_text.
  # controller - creates, saves and recreates resources.
  # options    - Hash of settings. :logger receives progress messages;
  #              when it is absent a Logger writing to $stdout is created.
  def initialize(database, generator, controller, options = {})
    @database = database
    @generator = generator
    @controller = controller

    @links = []
    @touched = Set.new

    @logger = options[:logger] || Logger.new($stdout)
  end

  # Never assigned by this class (always nil); kept so existing callers of
  # the reader keep working.
  attr_reader :resources

  # Queues one link (String) or several (Array) as starting points.
  # Anything else is logged as an error and ignored.
  def add(root)
    case root
    when String
      @links << root
    when Array
      @links += root
    else
      @logger.error "Could not add roots #{root.inspect}!"
    end
  end

  # Pairs the resource about to be fetched for a link with the copy of it
  # that may already be stored in the database.
  class Fetch
    def initialize(database, controller, link)
      @database = database
      @controller = controller

      # false means "not looked up yet"; nil means "looked up, not found".
      @document = false
      @current_resource = controller.create(link)
      @archived_resource = false
    end

    attr_reader :database, :controller, :current_resource

    # The stored document for the current resource, or nil when the
    # database has no posting for its name digest. Looked up once.
    def document
      if @document === false
        postlist = @database.postlist(@current_resource.name_digest)

        @document = postlist.size > 0 ? @database.document(postlist[0].docid) : nil
      end

      return @document
    end

    # The resource recreated from the stored document, or nil when there
    # is no stored document. Computed once.
    def archived_resource
      if @archived_resource === false
        stored = document
        @archived_resource = stored ? @controller.recreate(stored.data) : nil
      end

      return @archived_resource
    end

    # Links of the freshly fetched resource, else of the archived copy,
    # else nil.
    def links
      if @current_resource.fetched?
        @current_resource.links
      elsif archived_resource
        archived_resource.links
      end
    end
  end

  # Crawls breadth-first from the queued links, indexing every resource
  # that is missing from the database or no longer fresh.
  #
  # options - :count stops after that many resources have been processed
  #           (the unfinished queue is kept for a later call); :depth stops
  #           once more than that many levels have been completed.
  # block   - optional; called with each discovered link, it returns the
  #           link to follow or nil to drop it. Without a block every
  #           link is followed.
  #
  # Returns the number of resources processed by this call.
  def process(options = {}, &block)
    count = 0
    depth = 0

    until @links.empty?
      new_links = []

      @links.each do |link|
        # Mark and sweep - don't review the same resource twice!
        next if @touched.include?(link)
        @touched << link

        # Create a new fetch from the database...
        fetch = Fetch.new(@database, @controller, link)
        refresh(fetch, block)

        new_links += followed_links(fetch, block)

        count += 1

        if options[:count] && count >= options[:count]
          # If we have to leave before finishing this breadth...
          @links += new_links
          return count
        end
      end

      @links = new_links

      depth += 1

      return count if options[:depth] && depth > options[:depth]
    end

    return count
  end

  # Deletes every stored document whose recreated resource is not fresh.
  def remove_old!
    @database.postlist("").each do |post|
      document = @database.document(post.docid)
      resource = @controller.recreate(document.data)

      unless resource.fresh?
        @logger.info "Removing expired index for #{resource.name}."
        @database.delete_document(post.docid)
      end
    end
  end

  private

  # Fetches and indexes the current resource of +fetch+ unless the
  # archived copy is still fresh. Fetch errors are logged, not raised.
  def refresh(fetch, block)
    resource = fetch.current_resource
    archived = fetch.archived_resource

    # Does it already exist in the current database (and fresh?)
    if archived && archived.fresh?
      @logger.info "Still fresh #{resource.name}..."
      return
    end

    begin
      @logger.info "Indexing #{resource.name}..."
      resource.fetch!
    rescue => error
      @logger.error "Could not fetch resource #{resource.name}: #{error}!"
      (error.backtrace || []).each { |line| @logger.error(line) }
    end

    # Did we fetch a resource and was it indexable?
    if !resource.fetched?
      @logger.warn "Could not fetch resource #{resource.name}!"
    elsif resource.content?
      store(resource)
    else
      @logger.warn "Resource was not indexable #{resource.name}!"
      @logger.warn "Links = #{followed_links(fetch, block).inspect}"
    end
  end

  # Writes +resource+ into the database under its name digest, replacing
  # any document already stored under that term.
  def store(resource)
    doc = Xapian::Document.new
    doc.data = @controller.save(resource)
    doc.add_term(resource.name_digest)

    @generator.document = doc
    @generator.index_text(resource.content)
    @database.replace_document(resource.name_digest, doc)
  end

  # The links of +fetch+ passed through +block+ (when given), without nils.
  def followed_links(fetch, block)
    links = fetch.links || []
    links = links.map(&block) if block
    links.compact
  end
end
180
+
181
+ end
182
+ end
@@ -0,0 +1,27 @@
1
+ # Copyright (c) 2010 Samuel Williams. Released under the GNU GPLv3.
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
module Xapian
  module Indexer
    # Version of the indexer, as separate components and as a dotted string.
    module VERSION #:nodoc:
      MAJOR = 1
      MINOR = 2
      TINY = 3
      REV = 1

      # The four components joined with dots.
      STRING = "#{MAJOR}.#{MINOR}.#{TINY}.#{REV}"
    end
  end
end
metadata ADDED
@@ -0,0 +1,74 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: xapian-indexer
3
+ version: !ruby/object:Gem::Version
4
+ hash: 65
5
+ prerelease: false
6
+ segments:
7
+ - 1
8
+ - 2
9
+ - 3
10
+ - 1
11
+ version: 1.2.3.1
12
+ platform: ruby
13
+ authors:
14
+ - Samuel Williams
15
+ autorequire:
16
+ bindir: bin
17
+ cert_chain: []
18
+
19
+ date: 2010-12-19 00:00:00 +13:00
20
+ default_executable:
21
+ dependencies: []
22
+
23
+ description:
24
+ email: samuel.williams@oriontransfer.co.nz
25
+ executables: []
26
+
27
+ extensions: []
28
+
29
+ extra_rdoc_files: []
30
+
31
+ files:
32
+ - lib/xapian/indexer/extensions.rb
33
+ - lib/xapian/indexer/extractors/html.rb
34
+ - lib/xapian/indexer/loaders/http.rb
35
+ - lib/xapian/indexer/resource.rb
36
+ - lib/xapian/indexer/spider.rb
37
+ - lib/xapian/indexer/version.rb
38
+ - lib/xapian/indexer.rb
39
+ has_rdoc: true
40
+ homepage: http://www.oriontransfer.co.nz/software/xapian
41
+ licenses: []
42
+
43
+ post_install_message:
44
+ rdoc_options: []
45
+
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ hash: 3
54
+ segments:
55
+ - 0
56
+ version: "0"
57
+ required_rubygems_version: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ hash: 3
63
+ segments:
64
+ - 0
65
+ version: "0"
66
+ requirements: []
67
+
68
+ rubyforge_project:
69
+ rubygems_version: 1.3.7
70
+ signing_key:
71
+ specification_version: 3
72
+ summary: Xapian is a framework for fast full-text searching.
73
+ test_files: []
74
+