xapian-indexer 1.2.3.4 → 1.2.19.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 72b813e2b0019183e68679aebf13532f607f4adb
4
+ data.tar.gz: 3806c150659093f02f1c394b227715790bfbe218
5
+ SHA512:
6
+ metadata.gz: 2583e40e2e9eeecf3247822595e8dbedc09d07fd1bc18ba64c95b72a5ff0413cc0bba51434a69b6ccb61e60669a4f05616d615e3e229f4105512e93315316e12
7
+ data.tar.gz: f77f2f725d2fb549b6c0eea955ef709390090916c090e8259c0d94474bbaaa686d340a56d7987711e755591b5e40aa757e9e2e1b88fe0651b753d0304be4b664
@@ -0,0 +1,22 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ *.bundle
19
+ *.so
20
+ *.o
21
+ *.a
22
+ mkmf.log
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in xapian-indexer.gemspec
4
+ gemspec
@@ -0,0 +1,109 @@
1
+ # Xapian::Indexer
2
+
3
+ Xapian::Indexer provides a flexible spider for indexing resources.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'xapian-indexer'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install xapian-indexer
18
+
19
+ ## Usage
20
+
21
+ The following example shows how to create a spider and index some resources:
22
+
23
+ #!/usr/bin/env ruby
24
+
25
+ require 'rubygems'
26
+
27
+ require 'uri'
28
+
29
+ require 'xapian/indexer'
30
+ require 'xapian/indexer/loaders/http'
31
+ require 'xapian/indexer/extractors/html'
32
+
33
+ $stderr.sync = true
34
+
35
+ database = Xapian::WritableDatabase.new("/tmp/xapian-oriontransfer", Xapian::DB_CREATE_OR_OPEN)
36
+ generator = Xapian::TermGenerator.new()
37
+
38
+ # Setup the controller
39
+ controller = Xapian::Indexer::Controller.new
40
+
41
+ controller.loaders << Xapian::Indexer::Loaders::HTTP.new()
42
+ controller.extractors['text/html'] = Xapian::Indexer::Extractors::HTML.new()
43
+
44
+ spider = Xapian::Indexer::Spider.new(database, generator, controller)
45
+
46
+ spider.add("http://www.oriontransfer.co.nz/welcome/index")
47
+
48
+ spider.process(:depth => 2, :count => 50) do |link|
49
+ uri = URI.parse(link)
50
+
51
+ case uri.host
52
+ when "www.oriontransfer.co.nz"
53
+ link
54
+ else
55
+ $stderr.puts "Skipping #{link}"
56
+ nil
57
+ end
58
+ end
59
+
60
+ # Start an enquire session.
61
+ enquire = Xapian::Enquire.new(database)
62
+
63
+ # Setup the query parser
64
+ qp = Xapian::QueryParser.new()
65
+ query = qp.parse_query("services")
66
+
67
+ enquire.query = query
68
+ matchset = enquire.mset(0, 10)
69
+
70
+ # Display the results.
71
+ puts "#{matchset.matches_estimated()} results found."
72
+ puts "Matches 1-#{matchset.size}:\n"
73
+
74
+ matchset.matches.each do |m|
75
+ resource = YAML::load(m.document.data)
76
+ puts "#{m.rank + 1}: #{m.percent}% docid=#{m.docid} [#{resource[:name]}]"
77
+ end
78
+
79
+ ## Contributing
80
+
81
+ 1. Fork it
82
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
83
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
84
+ 4. Push to the branch (`git push origin my-new-feature`)
85
+ 5. Create a new Pull Request
86
+
87
+ ## License
88
+
89
+ Released under the MIT license.
90
+
91
+ Copyright, 2015, by [Samuel G. D. Williams](http://www.codeotaku.com/samuel-williams).
92
+
93
+ Permission is hereby granted, free of charge, to any person obtaining a copy
94
+ of this software and associated documentation files (the "Software"), to deal
95
+ in the Software without restriction, including without limitation the rights
96
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
97
+ copies of the Software, and to permit persons to whom the Software is
98
+ furnished to do so, subject to the following conditions:
99
+
100
+ The above copyright notice and this permission notice shall be included in
101
+ all copies or substantial portions of the Software.
102
+
103
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
104
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
105
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
106
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
107
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
108
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
109
+ THE SOFTWARE.
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
@@ -1,7 +1,4 @@
1
1
 
2
- require 'logger'
3
-
4
- require 'xapian/indexer/extensions'
5
2
  require 'xapian/indexer/version'
6
3
  require 'xapian/indexer/resource'
7
4
  require 'xapian/indexer/spider'
@@ -98,7 +98,6 @@ module Xapian
98
98
  html.search("form").remove
99
99
  html.css('.noindex').remove
100
100
 
101
-
102
101
  body = html.at('html/body')
103
102
 
104
103
  if body
@@ -18,10 +18,9 @@ require 'xapian/indexer/version'
18
18
 
19
19
  module Xapian
20
20
  module Indexer
21
-
22
21
  module Loaders
23
22
  class HTTP
24
- UserAgent = "Xapian-Spider #{Xapian::Indexer::VERSION::STRING}"
23
+ UserAgent = "Xapian-Spider #{Xapian::Indexer::VERSION}"
25
24
 
26
25
  def initialize(options = {})
27
26
  @options = options
@@ -53,10 +52,7 @@ module Xapian
53
52
  return false
54
53
  end
55
54
  end
56
-
57
-
58
55
  end
59
-
60
56
  end
61
57
  end
62
58
 
@@ -14,6 +14,7 @@
14
14
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
15
 
16
16
  require 'digest/md5'
17
+ require 'yaml'
17
18
 
18
19
  module Xapian
19
20
  module Indexer
@@ -76,6 +77,7 @@ module Xapian
76
77
 
77
78
  def recreate(data)
78
79
  values = YAML::load(data)
80
+
79
81
  Resource.new(values[:name], self, values)
80
82
  end
81
83
  end
@@ -16,6 +16,8 @@
16
16
  require 'xapian'
17
17
  require 'set'
18
18
 
19
+ require 'logger'
20
+
19
21
  module Xapian
20
22
  module Indexer
21
23
  # Represents a process which consumes resources into the database
@@ -15,13 +15,6 @@
15
15
 
16
16
  module Xapian
17
17
  module Indexer
18
- module VERSION #:nodoc:
19
- MAJOR = 1
20
- MINOR = 2
21
- TINY = 3
22
- REV = 4
23
-
24
- STRING = [MAJOR, MINOR, TINY, REV].join('.')
25
- end
18
+ VERSION = "1.2.19.1"
26
19
  end
27
20
  end
@@ -0,0 +1,55 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+
5
+ require 'uri'
6
+
7
+ require 'xapian/indexer'
8
+ require 'xapian/indexer/loaders/http'
9
+ require 'xapian/indexer/extractors/html'
10
+
11
+ $stderr.sync = true
12
+
13
+ database = Xapian::WritableDatabase.new("/tmp/xapian-oriontransfer", Xapian::DB_CREATE_OR_OPEN)
14
+ generator = Xapian::TermGenerator.new()
15
+
16
+ # Setup the controller
17
+ controller = Xapian::Indexer::Controller.new
18
+
19
+ controller.loaders << Xapian::Indexer::Loaders::HTTP.new()
20
+ controller.extractors['text/html'] = Xapian::Indexer::Extractors::HTML.new()
21
+
22
+ spider = Xapian::Indexer::Spider.new(database, generator, controller)
23
+
24
+ spider.add("http://www.oriontransfer.co.nz/welcome/index")
25
+
26
+ spider.process(:depth => 2, :count => 50) do |link|
27
+ uri = URI.parse(link)
28
+
29
+ case uri.host
30
+ when "www.oriontransfer.co.nz"
31
+ link
32
+ else
33
+ $stderr.puts "Skipping #{link}"
34
+ nil
35
+ end
36
+ end
37
+
38
+ # Start an enquire session.
39
+ enquire = Xapian::Enquire.new(database)
40
+
41
+ # Setup the query parser
42
+ qp = Xapian::QueryParser.new()
43
+ query = qp.parse_query("services")
44
+
45
+ enquire.query = query
46
+ matchset = enquire.mset(0, 10)
47
+
48
+ # Display the results.
49
+ puts "#{matchset.matches_estimated()} results found."
50
+ puts "Matches 1-#{matchset.size}:\n"
51
+
52
+ matchset.matches.each do |m|
53
+ resource = YAML::load(m.document.data)
54
+ puts "#{m.rank + 1}: #{m.percent}% docid=#{m.docid} [#{resource[:name]}]"
55
+ end
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'xapian/indexer/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "xapian-indexer"
8
+ spec.version = Xapian::Indexer::VERSION
9
+ spec.authors = ["Samuel Williams"]
10
+ spec.email = ["samuel.williams@oriontransfer.co.nz"]
11
+ spec.summary = %q{Xapian::Indexer provides a flexible spider for indexing resources.}
12
+ spec.homepage = ""
13
+ spec.license = "MIT"
14
+
15
+ spec.files = `git ls-files -z`.split("\x0")
16
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
17
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
+ spec.require_paths = ["lib"]
19
+
20
+ spec.add_dependency 'xapian-core'
21
+ spec.add_dependency 'nokogiri'
22
+
23
+ spec.add_development_dependency "bundler", "~> 1.6"
24
+ spec.add_development_dependency "rake"
25
+ end
metadata CHANGED
@@ -1,101 +1,113 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: xapian-indexer
3
- version: !ruby/object:Gem::Version
4
- hash: 75
5
- prerelease: false
6
- segments:
7
- - 1
8
- - 2
9
- - 3
10
- - 4
11
- version: 1.2.3.4
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.2.19.1
12
5
  platform: ruby
13
- authors:
6
+ authors:
14
7
  - Samuel Williams
15
8
  autorequire:
16
9
  bindir: bin
17
10
  cert_chain: []
18
-
19
- date: 2011-01-24 00:00:00 +13:00
20
- default_executable:
21
- dependencies:
22
- - !ruby/object:Gem::Dependency
11
+ date: 2015-01-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
23
14
  name: xapian-core
24
- prerelease: false
25
- requirement: &id001 !ruby/object:Gem::Requirement
26
- none: false
27
- requirements:
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
28
17
  - - ">="
29
- - !ruby/object:Gem::Version
30
- hash: 3
31
- segments:
32
- - 0
33
- version: "0"
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
34
20
  type: :runtime
35
- version_requirements: *id001
36
- - !ruby/object:Gem::Dependency
37
- name: nokogiri
38
21
  prerelease: false
39
- requirement: &id002 !ruby/object:Gem::Requirement
40
- none: false
41
- requirements:
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
42
24
  - - ">="
43
- - !ruby/object:Gem::Version
44
- hash: 3
45
- segments:
46
- - 0
47
- version: "0"
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
48
34
  type: :runtime
49
- version_requirements: *id002
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.6'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.6'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
50
69
  description:
51
- email: samuel.williams@oriontransfer.co.nz
70
+ email:
71
+ - samuel.williams@oriontransfer.co.nz
52
72
  executables: []
53
-
54
73
  extensions: []
55
-
56
74
  extra_rdoc_files: []
57
-
58
- files:
59
- - lib/xapian/indexer/extensions.rb
75
+ files:
76
+ - ".gitignore"
77
+ - Gemfile
78
+ - README.md
79
+ - Rakefile
80
+ - lib/xapian/indexer.rb
60
81
  - lib/xapian/indexer/extractors/html.rb
61
82
  - lib/xapian/indexer/loaders/http.rb
62
83
  - lib/xapian/indexer/resource.rb
63
84
  - lib/xapian/indexer/spider.rb
64
85
  - lib/xapian/indexer/version.rb
65
- - lib/xapian/indexer.rb
66
- has_rdoc: true
67
- homepage: http://www.oriontransfer.co.nz/software/xapian
68
- licenses: []
69
-
86
+ - test/oriontransfer.rb
87
+ - xapian-indexer.gemspec
88
+ homepage: ''
89
+ licenses:
90
+ - MIT
91
+ metadata: {}
70
92
  post_install_message:
71
93
  rdoc_options: []
72
-
73
- require_paths:
94
+ require_paths:
74
95
  - lib
75
- required_ruby_version: !ruby/object:Gem::Requirement
76
- none: false
77
- requirements:
96
+ required_ruby_version: !ruby/object:Gem::Requirement
97
+ requirements:
78
98
  - - ">="
79
- - !ruby/object:Gem::Version
80
- hash: 3
81
- segments:
82
- - 0
83
- version: "0"
84
- required_rubygems_version: !ruby/object:Gem::Requirement
85
- none: false
86
- requirements:
99
+ - !ruby/object:Gem::Version
100
+ version: '0'
101
+ required_rubygems_version: !ruby/object:Gem::Requirement
102
+ requirements:
87
103
  - - ">="
88
- - !ruby/object:Gem::Version
89
- hash: 3
90
- segments:
91
- - 0
92
- version: "0"
104
+ - !ruby/object:Gem::Version
105
+ version: '0'
93
106
  requirements: []
94
-
95
107
  rubyforge_project:
96
- rubygems_version: 1.3.7
108
+ rubygems_version: 2.2.2
97
109
  signing_key:
98
- specification_version: 3
110
+ specification_version: 4
99
111
  summary: Xapian::Indexer provides a flexible spider for indexing resources.
100
- test_files: []
101
-
112
+ test_files:
113
+ - test/oriontransfer.rb
@@ -1,45 +0,0 @@
1
- # Copyright (c) 2010 Samuel Williams. Released under the GNU GPLv3.
2
- #
3
- # This program is free software: you can redistribute it and/or modify
4
- # it under the terms of the GNU General Public License as published by
5
- # the Free Software Foundation, either version 3 of the License, or
6
- # (at your option) any later version.
7
- #
8
- # This program is distributed in the hope that it will be useful,
9
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
- # GNU General Public License for more details.
12
- #
13
- # You should have received a copy of the GNU General Public License
14
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
-
16
- require 'uri'
17
-
18
- class URI::Generic
19
- def absolute_path?
20
- path.match('^/')
21
- end
22
-
23
- def relative_path?
24
- !absolute_path?
25
- end
26
-
27
- # Behavior in 1.8.7 seems to be broken...?
28
- def merge0(oth)
29
- case oth
30
- when Generic
31
- when String
32
- oth = URI.parse(oth)
33
- else
34
- raise ArgumentError, "bad argument(expected URI object or URI string)"
35
- end
36
-
37
- if oth.absolute?
38
- return oth, oth
39
- else
40
- return self.dup, oth
41
- end
42
- end
43
- end
44
-
45
- # puts URI.parse("/bob/dole") + URI.parse("http://www.lucidsystems.org")