xapian-indexer 1.2.3.4 → 1.2.19.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/Gemfile +4 -0
- data/README.md +109 -0
- data/Rakefile +2 -0
- data/lib/xapian/indexer.rb +0 -3
- data/lib/xapian/indexer/extractors/html.rb +0 -1
- data/lib/xapian/indexer/loaders/http.rb +1 -5
- data/lib/xapian/indexer/resource.rb +2 -0
- data/lib/xapian/indexer/spider.rb +2 -0
- data/lib/xapian/indexer/version.rb +1 -8
- data/test/oriontransfer.rb +55 -0
- data/xapian-indexer.gemspec +25 -0
- metadata +83 -71
- data/lib/xapian/indexer/extensions.rb +0 -45
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 72b813e2b0019183e68679aebf13532f607f4adb
|
4
|
+
data.tar.gz: 3806c150659093f02f1c394b227715790bfbe218
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 2583e40e2e9eeecf3247822595e8dbedc09d07fd1bc18ba64c95b72a5ff0413cc0bba51434a69b6ccb61e60669a4f05616d615e3e229f4105512e93315316e12
|
7
|
+
data.tar.gz: f77f2f725d2fb549b6c0eea955ef709390090916c090e8259c0d94474bbaaa686d340a56d7987711e755591b5e40aa757e9e2e1b88fe0651b753d0304be4b664
|
data/.gitignore
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
.yardoc
|
6
|
+
Gemfile.lock
|
7
|
+
InstalledFiles
|
8
|
+
_yardoc
|
9
|
+
coverage
|
10
|
+
doc/
|
11
|
+
lib/bundler/man
|
12
|
+
pkg
|
13
|
+
rdoc
|
14
|
+
spec/reports
|
15
|
+
test/tmp
|
16
|
+
test/version_tmp
|
17
|
+
tmp
|
18
|
+
*.bundle
|
19
|
+
*.so
|
20
|
+
*.o
|
21
|
+
*.a
|
22
|
+
mkmf.log
|
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,109 @@
|
|
1
|
+
# Xapian::Indexer
|
2
|
+
|
3
|
+
TODO: Write a gem description
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'xapian-indexer'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install xapian-indexer
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
The following example shows how to create a spider and index some resources:
|
22
|
+
|
23
|
+
#!/usr/bin/env ruby
|
24
|
+
|
25
|
+
require 'rubygems'
|
26
|
+
|
27
|
+
require 'uri'
|
28
|
+
|
29
|
+
require 'xapian/indexer'
|
30
|
+
require 'xapian/indexer/loaders/http'
|
31
|
+
require 'xapian/indexer/extractors/html'
|
32
|
+
|
33
|
+
$stderr.sync = true
|
34
|
+
|
35
|
+
database = Xapian::WritableDatabase.new("/tmp/xapian-oriontransfer", Xapian::DB_CREATE_OR_OPEN)
|
36
|
+
generator = Xapian::TermGenerator.new()
|
37
|
+
|
38
|
+
# Setup the controller
|
39
|
+
controller = Xapian::Indexer::Controller.new
|
40
|
+
|
41
|
+
controller.loaders << Xapian::Indexer::Loaders::HTTP.new()
|
42
|
+
controller.extractors['text/html'] = Xapian::Indexer::Extractors::HTML.new()
|
43
|
+
|
44
|
+
spider = Xapian::Indexer::Spider.new(database, generator, controller)
|
45
|
+
|
46
|
+
spider.add("http://www.oriontransfer.co.nz/welcome/index")
|
47
|
+
|
48
|
+
spider.process(:depth => 2, :count => 50) do |link|
|
49
|
+
uri = URI.parse(link)
|
50
|
+
|
51
|
+
case uri.host
|
52
|
+
when "www.oriontransfer.co.nz"
|
53
|
+
link
|
54
|
+
else
|
55
|
+
$stderr.puts "Skipping #{link}"
|
56
|
+
nil
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
# Start an enquire session.
|
61
|
+
enquire = Xapian::Enquire.new(database)
|
62
|
+
|
63
|
+
# Setup the query parser
|
64
|
+
qp = Xapian::QueryParser.new()
|
65
|
+
query = qp.parse_query("services")
|
66
|
+
|
67
|
+
enquire.query = query
|
68
|
+
matchset = enquire.mset(0, 10)
|
69
|
+
|
70
|
+
# Display the results.
|
71
|
+
puts "#{matchset.matches_estimated()} results found."
|
72
|
+
puts "Matches 1-#{matchset.size}:\n"
|
73
|
+
|
74
|
+
matchset.matches.each do |m|
|
75
|
+
resource = YAML::load(m.document.data)
|
76
|
+
puts "#{m.rank + 1}: #{m.percent}% docid=#{m.docid} [#{resource[:name]}]"
|
77
|
+
end
|
78
|
+
|
79
|
+
## Contributing
|
80
|
+
|
81
|
+
1. Fork it
|
82
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
83
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
84
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
85
|
+
5. Create new Pull Request
|
86
|
+
|
87
|
+
## License
|
88
|
+
|
89
|
+
Released under the MIT license.
|
90
|
+
|
91
|
+
Copyright, 2015, by [Samuel G. D. Williams](http://www.codeotaku.com/samuel-williams).
|
92
|
+
|
93
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
94
|
+
of this software and associated documentation files (the "Software"), to deal
|
95
|
+
in the Software without restriction, including without limitation the rights
|
96
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
97
|
+
copies of the Software, and to permit persons to whom the Software is
|
98
|
+
furnished to do so, subject to the following conditions:
|
99
|
+
|
100
|
+
The above copyright notice and this permission notice shall be included in
|
101
|
+
all copies or substantial portions of the Software.
|
102
|
+
|
103
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
104
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
105
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
106
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
107
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
108
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
109
|
+
THE SOFTWARE.
|
data/Rakefile
ADDED
data/lib/xapian/indexer.rb
CHANGED
@@ -18,10 +18,9 @@ require 'xapian/indexer/version'
|
|
18
18
|
|
19
19
|
module Xapian
|
20
20
|
module Indexer
|
21
|
-
|
22
21
|
module Loaders
|
23
22
|
class HTTP
|
24
|
-
UserAgent = "Xapian-Spider #{Xapian::Indexer::VERSION
|
23
|
+
UserAgent = "Xapian-Spider #{Xapian::Indexer::VERSION}"
|
25
24
|
|
26
25
|
def initialize(options = {})
|
27
26
|
@options = options
|
@@ -53,10 +52,7 @@ module Xapian
|
|
53
52
|
return false
|
54
53
|
end
|
55
54
|
end
|
56
|
-
|
57
|
-
|
58
55
|
end
|
59
|
-
|
60
56
|
end
|
61
57
|
end
|
62
58
|
|
@@ -14,6 +14,7 @@
|
|
14
14
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
15
|
|
16
16
|
require 'digest/md5'
|
17
|
+
require 'yaml'
|
17
18
|
|
18
19
|
module Xapian
|
19
20
|
module Indexer
|
@@ -76,6 +77,7 @@ module Xapian
|
|
76
77
|
|
77
78
|
def recreate(data)
|
78
79
|
values = YAML::load(data)
|
80
|
+
|
79
81
|
Resource.new(values[:name], self, values)
|
80
82
|
end
|
81
83
|
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
|
5
|
+
require 'uri'
|
6
|
+
|
7
|
+
require 'xapian/indexer'
|
8
|
+
require 'xapian/indexer/loaders/http'
|
9
|
+
require 'xapian/indexer/extractors/html'
|
10
|
+
|
11
|
+
$stderr.sync = true
|
12
|
+
|
13
|
+
database = Xapian::WritableDatabase.new("/tmp/xapian-oriontransfer", Xapian::DB_CREATE_OR_OPEN)
|
14
|
+
generator = Xapian::TermGenerator.new()
|
15
|
+
|
16
|
+
# Setup the controller
|
17
|
+
controller = Xapian::Indexer::Controller.new
|
18
|
+
|
19
|
+
controller.loaders << Xapian::Indexer::Loaders::HTTP.new()
|
20
|
+
controller.extractors['text/html'] = Xapian::Indexer::Extractors::HTML.new()
|
21
|
+
|
22
|
+
spider = Xapian::Indexer::Spider.new(database, generator, controller)
|
23
|
+
|
24
|
+
spider.add("http://www.oriontransfer.co.nz/welcome/index")
|
25
|
+
|
26
|
+
spider.process(:depth => 2, :count => 50) do |link|
|
27
|
+
uri = URI.parse(link)
|
28
|
+
|
29
|
+
case uri.host
|
30
|
+
when "www.oriontransfer.co.nz"
|
31
|
+
link
|
32
|
+
else
|
33
|
+
$stderr.puts "Skipping #{link}"
|
34
|
+
nil
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
# Start an enquire session.
|
39
|
+
enquire = Xapian::Enquire.new(database)
|
40
|
+
|
41
|
+
# Setup the query parser
|
42
|
+
qp = Xapian::QueryParser.new()
|
43
|
+
query = qp.parse_query("services")
|
44
|
+
|
45
|
+
enquire.query = query
|
46
|
+
matchset = enquire.mset(0, 10)
|
47
|
+
|
48
|
+
# Display the results.
|
49
|
+
puts "#{matchset.matches_estimated()} results found."
|
50
|
+
puts "Matches 1-#{matchset.size}:\n"
|
51
|
+
|
52
|
+
matchset.matches.each do |m|
|
53
|
+
resource = YAML::load(m.document.data)
|
54
|
+
puts "#{m.rank + 1}: #{m.percent}% docid=#{m.docid} [#{resource[:name]}]"
|
55
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'xapian/indexer/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "xapian-indexer"
|
8
|
+
spec.version = Xapian::Indexer::VERSION
|
9
|
+
spec.authors = ["Samuel Williams"]
|
10
|
+
spec.email = ["samuel.williams@oriontransfer.co.nz"]
|
11
|
+
spec.summary = %q{Xapian::Indexer provides a flexible spider for indexing resources.}
|
12
|
+
spec.homepage = ""
|
13
|
+
spec.license = "MIT"
|
14
|
+
|
15
|
+
spec.files = `git ls-files -z`.split("\x0")
|
16
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
17
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
|
+
spec.require_paths = ["lib"]
|
19
|
+
|
20
|
+
spec.add_dependency 'xapian-core'
|
21
|
+
spec.add_dependency 'nokogiri'
|
22
|
+
|
23
|
+
spec.add_development_dependency "bundler", "~> 1.6"
|
24
|
+
spec.add_development_dependency "rake"
|
25
|
+
end
|
metadata
CHANGED
@@ -1,101 +1,113 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: xapian-indexer
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
prerelease: false
|
6
|
-
segments:
|
7
|
-
- 1
|
8
|
-
- 2
|
9
|
-
- 3
|
10
|
-
- 4
|
11
|
-
version: 1.2.3.4
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.2.19.1
|
12
5
|
platform: ruby
|
13
|
-
authors:
|
6
|
+
authors:
|
14
7
|
- Samuel Williams
|
15
8
|
autorequire:
|
16
9
|
bindir: bin
|
17
10
|
cert_chain: []
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
dependencies:
|
22
|
-
- !ruby/object:Gem::Dependency
|
11
|
+
date: 2015-01-07 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
23
14
|
name: xapian-core
|
24
|
-
|
25
|
-
|
26
|
-
none: false
|
27
|
-
requirements:
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
28
17
|
- - ">="
|
29
|
-
- !ruby/object:Gem::Version
|
30
|
-
|
31
|
-
segments:
|
32
|
-
- 0
|
33
|
-
version: "0"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
34
20
|
type: :runtime
|
35
|
-
version_requirements: *id001
|
36
|
-
- !ruby/object:Gem::Dependency
|
37
|
-
name: nokogiri
|
38
21
|
prerelease: false
|
39
|
-
|
40
|
-
|
41
|
-
requirements:
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
42
24
|
- - ">="
|
43
|
-
- !ruby/object:Gem::Version
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
48
34
|
type: :runtime
|
49
|
-
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: bundler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.6'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.6'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
50
69
|
description:
|
51
|
-
email:
|
70
|
+
email:
|
71
|
+
- samuel.williams@oriontransfer.co.nz
|
52
72
|
executables: []
|
53
|
-
|
54
73
|
extensions: []
|
55
|
-
|
56
74
|
extra_rdoc_files: []
|
57
|
-
|
58
|
-
|
59
|
-
-
|
75
|
+
files:
|
76
|
+
- ".gitignore"
|
77
|
+
- Gemfile
|
78
|
+
- README.md
|
79
|
+
- Rakefile
|
80
|
+
- lib/xapian/indexer.rb
|
60
81
|
- lib/xapian/indexer/extractors/html.rb
|
61
82
|
- lib/xapian/indexer/loaders/http.rb
|
62
83
|
- lib/xapian/indexer/resource.rb
|
63
84
|
- lib/xapian/indexer/spider.rb
|
64
85
|
- lib/xapian/indexer/version.rb
|
65
|
-
-
|
66
|
-
|
67
|
-
homepage:
|
68
|
-
licenses:
|
69
|
-
|
86
|
+
- test/oriontransfer.rb
|
87
|
+
- xapian-indexer.gemspec
|
88
|
+
homepage: ''
|
89
|
+
licenses:
|
90
|
+
- MIT
|
91
|
+
metadata: {}
|
70
92
|
post_install_message:
|
71
93
|
rdoc_options: []
|
72
|
-
|
73
|
-
require_paths:
|
94
|
+
require_paths:
|
74
95
|
- lib
|
75
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
76
|
-
|
77
|
-
requirements:
|
96
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
78
98
|
- - ">="
|
79
|
-
- !ruby/object:Gem::Version
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
version: "0"
|
84
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
85
|
-
none: false
|
86
|
-
requirements:
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: '0'
|
101
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
102
|
+
requirements:
|
87
103
|
- - ">="
|
88
|
-
- !ruby/object:Gem::Version
|
89
|
-
|
90
|
-
segments:
|
91
|
-
- 0
|
92
|
-
version: "0"
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
version: '0'
|
93
106
|
requirements: []
|
94
|
-
|
95
107
|
rubyforge_project:
|
96
|
-
rubygems_version:
|
108
|
+
rubygems_version: 2.2.2
|
97
109
|
signing_key:
|
98
|
-
specification_version:
|
110
|
+
specification_version: 4
|
99
111
|
summary: Xapian::Indexer provides a flexible spider for indexing resources.
|
100
|
-
test_files:
|
101
|
-
|
112
|
+
test_files:
|
113
|
+
- test/oriontransfer.rb
|
@@ -1,45 +0,0 @@
|
|
1
|
-
# Copyright (c) 2010 Samuel Williams. Released under the GNU GPLv3.
|
2
|
-
#
|
3
|
-
# This program is free software: you can redistribute it and/or modify
|
4
|
-
# it under the terms of the GNU General Public License as published by
|
5
|
-
# the Free Software Foundation, either version 3 of the License, or
|
6
|
-
# (at your option) any later version.
|
7
|
-
#
|
8
|
-
# This program is distributed in the hope that it will be useful,
|
9
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
-
# GNU General Public License for more details.
|
12
|
-
#
|
13
|
-
# You should have received a copy of the GNU General Public License
|
14
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
-
|
16
|
-
require 'uri'
|
17
|
-
|
18
|
-
class URI::Generic
|
19
|
-
def absolute_path?
|
20
|
-
path.match('^/')
|
21
|
-
end
|
22
|
-
|
23
|
-
def relative_path?
|
24
|
-
!absolute_path?
|
25
|
-
end
|
26
|
-
|
27
|
-
# Behavior in 1.8.7 seems to be broken...?
|
28
|
-
def merge0(oth)
|
29
|
-
case oth
|
30
|
-
when Generic
|
31
|
-
when String
|
32
|
-
oth = URI.parse(oth)
|
33
|
-
else
|
34
|
-
raise ArgumentError, "bad argument(expected URI object or URI string)"
|
35
|
-
end
|
36
|
-
|
37
|
-
if oth.absolute?
|
38
|
-
return oth, oth
|
39
|
-
else
|
40
|
-
return self.dup, oth
|
41
|
-
end
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
|
-
# puts URI.parse("/bob/dole") + URI.parse("http://www.lucidsystems.org")
|