xapian-indexer 1.2.3.4 → 1.2.19.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 72b813e2b0019183e68679aebf13532f607f4adb
4
+ data.tar.gz: 3806c150659093f02f1c394b227715790bfbe218
5
+ SHA512:
6
+ metadata.gz: 2583e40e2e9eeecf3247822595e8dbedc09d07fd1bc18ba64c95b72a5ff0413cc0bba51434a69b6ccb61e60669a4f05616d615e3e229f4105512e93315316e12
7
+ data.tar.gz: f77f2f725d2fb549b6c0eea955ef709390090916c090e8259c0d94474bbaaa686d340a56d7987711e755591b5e40aa757e9e2e1b88fe0651b753d0304be4b664
@@ -0,0 +1,22 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ *.bundle
19
+ *.so
20
+ *.o
21
+ *.a
22
+ mkmf.log
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in xapian-indexer.gemspec
4
+ gemspec
@@ -0,0 +1,109 @@
1
+ # Xapian::Indexer
2
+
3
+ Xapian::Indexer provides a flexible spider for indexing resources.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'xapian-indexer'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install xapian-indexer
18
+
19
+ ## Usage
20
+
21
+ The following example shows how to create a spider and index some resources:
22
+
23
+ #!/usr/bin/env ruby
24
+
25
+ require 'rubygems'
26
+
27
+ require 'uri'
28
+
29
+ require 'xapian/indexer'
30
+ require 'xapian/indexer/loaders/http'
31
+ require 'xapian/indexer/extractors/html'
32
+
33
+ $stderr.sync = true
34
+
35
+ database = Xapian::WritableDatabase.new("/tmp/xapian-oriontransfer", Xapian::DB_CREATE_OR_OPEN)
36
+ generator = Xapian::TermGenerator.new()
37
+
38
+ # Setup the controller
39
+ controller = Xapian::Indexer::Controller.new
40
+
41
+ controller.loaders << Xapian::Indexer::Loaders::HTTP.new()
42
+ controller.extractors['text/html'] = Xapian::Indexer::Extractors::HTML.new()
43
+
44
+ spider = Xapian::Indexer::Spider.new(database, generator, controller)
45
+
46
+ spider.add("http://www.oriontransfer.co.nz/welcome/index")
47
+
48
+ spider.process(:depth => 2, :count => 50) do |link|
49
+ uri = URI.parse(link)
50
+
51
+ case uri.host
52
+ when "www.oriontransfer.co.nz"
53
+ link
54
+ else
55
+ $stderr.puts "Skipping #{link}"
56
+ nil
57
+ end
58
+ end
59
+
60
+ # Start an enquire session.
61
+ enquire = Xapian::Enquire.new(database)
62
+
63
+ # Setup the query parser
64
+ qp = Xapian::QueryParser.new()
65
+ query = qp.parse_query("services")
66
+
67
+ enquire.query = query
68
+ matchset = enquire.mset(0, 10)
69
+
70
+ # Display the results.
71
+ puts "#{matchset.matches_estimated()} results found."
72
+ puts "Matches 1-#{matchset.size}:\n"
73
+
74
+ matchset.matches.each do |m|
75
+ resource = YAML::load(m.document.data)
76
+ puts "#{m.rank + 1}: #{m.percent}% docid=#{m.docid} [#{resource[:name]}]"
77
+ end
78
+
79
+ ## Contributing
80
+
81
+ 1. Fork it
82
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
83
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
84
+ 4. Push to the branch (`git push origin my-new-feature`)
85
+ 5. Create a new Pull Request
86
+
87
+ ## License
88
+
89
+ Released under the MIT license.
90
+
91
+ Copyright, 2015, by [Samuel G. D. Williams](http://www.codeotaku.com/samuel-williams).
92
+
93
+ Permission is hereby granted, free of charge, to any person obtaining a copy
94
+ of this software and associated documentation files (the "Software"), to deal
95
+ in the Software without restriction, including without limitation the rights
96
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
97
+ copies of the Software, and to permit persons to whom the Software is
98
+ furnished to do so, subject to the following conditions:
99
+
100
+ The above copyright notice and this permission notice shall be included in
101
+ all copies or substantial portions of the Software.
102
+
103
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
104
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
105
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
106
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
107
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
108
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
109
+ THE SOFTWARE.
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
@@ -1,7 +1,4 @@
1
1
 
2
- require 'logger'
3
-
4
- require 'xapian/indexer/extensions'
5
2
  require 'xapian/indexer/version'
6
3
  require 'xapian/indexer/resource'
7
4
  require 'xapian/indexer/spider'
@@ -98,7 +98,6 @@ module Xapian
98
98
  html.search("form").remove
99
99
  html.css('.noindex').remove
100
100
 
101
-
102
101
  body = html.at('html/body')
103
102
 
104
103
  if body
@@ -18,10 +18,9 @@ require 'xapian/indexer/version'
18
18
 
19
19
  module Xapian
20
20
  module Indexer
21
-
22
21
  module Loaders
23
22
  class HTTP
24
- UserAgent = "Xapian-Spider #{Xapian::Indexer::VERSION::STRING}"
23
+ UserAgent = "Xapian-Spider #{Xapian::Indexer::VERSION}"
25
24
 
26
25
  def initialize(options = {})
27
26
  @options = options
@@ -53,10 +52,7 @@ module Xapian
53
52
  return false
54
53
  end
55
54
  end
56
-
57
-
58
55
  end
59
-
60
56
  end
61
57
  end
62
58
 
@@ -14,6 +14,7 @@
14
14
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
15
 
16
16
  require 'digest/md5'
17
+ require 'yaml'
17
18
 
18
19
  module Xapian
19
20
  module Indexer
@@ -76,6 +77,7 @@ module Xapian
76
77
 
77
78
  def recreate(data)
78
79
  values = YAML::load(data)
80
+
79
81
  Resource.new(values[:name], self, values)
80
82
  end
81
83
  end
@@ -16,6 +16,8 @@
16
16
  require 'xapian'
17
17
  require 'set'
18
18
 
19
+ require 'logger'
20
+
19
21
  module Xapian
20
22
  module Indexer
21
23
  # Represents a process which consumes resources into the database
@@ -15,13 +15,6 @@
15
15
 
16
16
  module Xapian
17
17
  module Indexer
18
- module VERSION #:nodoc:
19
- MAJOR = 1
20
- MINOR = 2
21
- TINY = 3
22
- REV = 4
23
-
24
- STRING = [MAJOR, MINOR, TINY, REV].join('.')
25
- end
18
+ VERSION = "1.2.19.1"
26
19
  end
27
20
  end
@@ -0,0 +1,55 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+
5
+ require 'uri'
6
+
7
+ require 'xapian/indexer'
8
+ require 'xapian/indexer/loaders/http'
9
+ require 'xapian/indexer/extractors/html'
10
+
11
+ $stderr.sync = true
12
+
13
+ database = Xapian::WritableDatabase.new("/tmp/xapian-oriontransfer", Xapian::DB_CREATE_OR_OPEN)
14
+ generator = Xapian::TermGenerator.new()
15
+
16
+ # Setup the controller
17
+ controller = Xapian::Indexer::Controller.new
18
+
19
+ controller.loaders << Xapian::Indexer::Loaders::HTTP.new()
20
+ controller.extractors['text/html'] = Xapian::Indexer::Extractors::HTML.new()
21
+
22
+ spider = Xapian::Indexer::Spider.new(database, generator, controller)
23
+
24
+ spider.add("http://www.oriontransfer.co.nz/welcome/index")
25
+
26
+ spider.process(:depth => 2, :count => 50) do |link|
27
+ uri = URI.parse(link)
28
+
29
+ case uri.host
30
+ when "www.oriontransfer.co.nz"
31
+ link
32
+ else
33
+ $stderr.puts "Skipping #{link}"
34
+ nil
35
+ end
36
+ end
37
+
38
+ # Start an enquire session.
39
+ enquire = Xapian::Enquire.new(database)
40
+
41
+ # Setup the query parser
42
+ qp = Xapian::QueryParser.new()
43
+ query = qp.parse_query("services")
44
+
45
+ enquire.query = query
46
+ matchset = enquire.mset(0, 10)
47
+
48
+ # Display the results.
49
+ puts "#{matchset.matches_estimated()} results found."
50
+ puts "Matches 1-#{matchset.size}:\n"
51
+
52
+ matchset.matches.each do |m|
53
+ resource = YAML::load(m.document.data)
54
+ puts "#{m.rank + 1}: #{m.percent}% docid=#{m.docid} [#{resource[:name]}]"
55
+ end
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'xapian/indexer/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "xapian-indexer"
8
+ spec.version = Xapian::Indexer::VERSION
9
+ spec.authors = ["Samuel Williams"]
10
+ spec.email = ["samuel.williams@oriontransfer.co.nz"]
11
+ spec.summary = %q{Xapian::Indexer provides a flexible spider for indexing resources.}
12
+ spec.homepage = ""
13
+ spec.license = "MIT"
14
+
15
+ spec.files = `git ls-files -z`.split("\x0")
16
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
17
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
+ spec.require_paths = ["lib"]
19
+
20
+ spec.add_dependency 'xapian-core'
21
+ spec.add_dependency 'nokogiri'
22
+
23
+ spec.add_development_dependency "bundler", "~> 1.6"
24
+ spec.add_development_dependency "rake"
25
+ end
metadata CHANGED
@@ -1,101 +1,113 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: xapian-indexer
3
- version: !ruby/object:Gem::Version
4
- hash: 75
5
- prerelease: false
6
- segments:
7
- - 1
8
- - 2
9
- - 3
10
- - 4
11
- version: 1.2.3.4
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.2.19.1
12
5
  platform: ruby
13
- authors:
6
+ authors:
14
7
  - Samuel Williams
15
8
  autorequire:
16
9
  bindir: bin
17
10
  cert_chain: []
18
-
19
- date: 2011-01-24 00:00:00 +13:00
20
- default_executable:
21
- dependencies:
22
- - !ruby/object:Gem::Dependency
11
+ date: 2015-01-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
23
14
  name: xapian-core
24
- prerelease: false
25
- requirement: &id001 !ruby/object:Gem::Requirement
26
- none: false
27
- requirements:
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
28
17
  - - ">="
29
- - !ruby/object:Gem::Version
30
- hash: 3
31
- segments:
32
- - 0
33
- version: "0"
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
34
20
  type: :runtime
35
- version_requirements: *id001
36
- - !ruby/object:Gem::Dependency
37
- name: nokogiri
38
21
  prerelease: false
39
- requirement: &id002 !ruby/object:Gem::Requirement
40
- none: false
41
- requirements:
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
42
24
  - - ">="
43
- - !ruby/object:Gem::Version
44
- hash: 3
45
- segments:
46
- - 0
47
- version: "0"
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
48
34
  type: :runtime
49
- version_requirements: *id002
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.6'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.6'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
50
69
  description:
51
- email: samuel.williams@oriontransfer.co.nz
70
+ email:
71
+ - samuel.williams@oriontransfer.co.nz
52
72
  executables: []
53
-
54
73
  extensions: []
55
-
56
74
  extra_rdoc_files: []
57
-
58
- files:
59
- - lib/xapian/indexer/extensions.rb
75
+ files:
76
+ - ".gitignore"
77
+ - Gemfile
78
+ - README.md
79
+ - Rakefile
80
+ - lib/xapian/indexer.rb
60
81
  - lib/xapian/indexer/extractors/html.rb
61
82
  - lib/xapian/indexer/loaders/http.rb
62
83
  - lib/xapian/indexer/resource.rb
63
84
  - lib/xapian/indexer/spider.rb
64
85
  - lib/xapian/indexer/version.rb
65
- - lib/xapian/indexer.rb
66
- has_rdoc: true
67
- homepage: http://www.oriontransfer.co.nz/software/xapian
68
- licenses: []
69
-
86
+ - test/oriontransfer.rb
87
+ - xapian-indexer.gemspec
88
+ homepage: ''
89
+ licenses:
90
+ - MIT
91
+ metadata: {}
70
92
  post_install_message:
71
93
  rdoc_options: []
72
-
73
- require_paths:
94
+ require_paths:
74
95
  - lib
75
- required_ruby_version: !ruby/object:Gem::Requirement
76
- none: false
77
- requirements:
96
+ required_ruby_version: !ruby/object:Gem::Requirement
97
+ requirements:
78
98
  - - ">="
79
- - !ruby/object:Gem::Version
80
- hash: 3
81
- segments:
82
- - 0
83
- version: "0"
84
- required_rubygems_version: !ruby/object:Gem::Requirement
85
- none: false
86
- requirements:
99
+ - !ruby/object:Gem::Version
100
+ version: '0'
101
+ required_rubygems_version: !ruby/object:Gem::Requirement
102
+ requirements:
87
103
  - - ">="
88
- - !ruby/object:Gem::Version
89
- hash: 3
90
- segments:
91
- - 0
92
- version: "0"
104
+ - !ruby/object:Gem::Version
105
+ version: '0'
93
106
  requirements: []
94
-
95
107
  rubyforge_project:
96
- rubygems_version: 1.3.7
108
+ rubygems_version: 2.2.2
97
109
  signing_key:
98
- specification_version: 3
110
+ specification_version: 4
99
111
  summary: Xapian::Indexer provides a flexible spider for indexing resources.
100
- test_files: []
101
-
112
+ test_files:
113
+ - test/oriontransfer.rb
@@ -1,45 +0,0 @@
1
- # Copyright (c) 2010 Samuel Williams. Released under the GNU GPLv3.
2
- #
3
- # This program is free software: you can redistribute it and/or modify
4
- # it under the terms of the GNU General Public License as published by
5
- # the Free Software Foundation, either version 3 of the License, or
6
- # (at your option) any later version.
7
- #
8
- # This program is distributed in the hope that it will be useful,
9
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
- # GNU General Public License for more details.
12
- #
13
- # You should have received a copy of the GNU General Public License
14
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
-
16
- require 'uri'
17
-
18
- class URI::Generic
19
- def absolute_path?
20
- path.match('^/')
21
- end
22
-
23
- def relative_path?
24
- !absolute_path?
25
- end
26
-
27
- # Behavior in 1.8.7 seems to be broken...?
28
- def merge0(oth)
29
- case oth
30
- when Generic
31
- when String
32
- oth = URI.parse(oth)
33
- else
34
- raise ArgumentError, "bad argument(expected URI object or URI string)"
35
- end
36
-
37
- if oth.absolute?
38
- return oth, oth
39
- else
40
- return self.dup, oth
41
- end
42
- end
43
- end
44
-
45
- # puts URI.parse("/bob/dole") + URI.parse("http://www.lucidsystems.org")