xapian-indexer 1.2.19.1 → 1.2.19.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/lib/xapian/indexer.rb +24 -3
- data/lib/xapian/indexer/extensions/uri.rb +41 -0
- data/lib/xapian/indexer/extractors/html.rb +17 -12
- data/lib/xapian/indexer/loaders/http.rb +18 -13
- data/lib/xapian/indexer/resource.rb +17 -12
- data/lib/xapian/indexer/spider.rb +17 -12
- data/lib/xapian/indexer/version.rb +18 -13
- data/xapian-indexer.gemspec +1 -1
- metadata +6 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8effe1bf39e7072ee007a20249dd43f0c1c7bbe7
|
4
|
+
data.tar.gz: dfd99dfe5b79e17534ec7a26acd89432021183b1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 847af60f50c2f0da6397b536d0a4cdeea60977cb0945f7719a34d261c4f2bb227744e60af29b70e8a472e553302366fd49fbf683ef4432a4975d416479fca80f
|
7
|
+
data.tar.gz: c63cfb012a7f94aed6bb136e3069fa161e0c1e395e664d221a5e1127c4fbeca88de9573f0f6a19798c1cd20903eb5c7a60c28557faa202f4c56e4acaaa6142a0
|
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# Xapian::Indexer
|
2
2
|
|
3
|
-
|
3
|
+
This gem provides basic infrastructure for indexing HTML documents over HTTP into a Xapian database.
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
@@ -86,7 +86,7 @@ The following example shows how to create a spider and index some resources:
|
|
86
86
|
|
87
87
|
## License
|
88
88
|
|
89
|
-
|
89
|
+
This code is dual licensed under the MIT license and GPLv3 license.
|
90
90
|
|
91
91
|
Copyright, 2015, by [Samuel G. D. Williams](http://www.codeotaku.com/samuel-williams).
|
92
92
|
|
data/lib/xapian/indexer.rb
CHANGED
@@ -1,7 +1,28 @@
|
|
1
|
+
# Copyright, 2015, by Samuel G. D. Williams. <http://www.codeotaku.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
1
20
|
|
2
|
-
|
3
|
-
|
4
|
-
|
21
|
+
require_relative 'indexer/extensions/uri'
|
22
|
+
|
23
|
+
require_relative 'indexer/version'
|
24
|
+
require_relative 'indexer/resource'
|
25
|
+
require_relative 'indexer/spider'
|
5
26
|
|
6
27
|
module Xapian
|
7
28
|
module Indexer
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# Copyright, 2015, by Samuel G. D. Williams. <http://www.codeotaku.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require 'uri'
|
22
|
+
|
23
|
+
class URI::Generic
|
24
|
+
# This change allows you to merge relative URLs which otherwise isn't possible.
|
25
|
+
def merge0(oth)
|
26
|
+
oth = parser.send(:convert_to_uri, oth)
|
27
|
+
|
28
|
+
if self.absolute? && oth.absolute?
|
29
|
+
#raise BadURIError,
|
30
|
+
# "both URI are absolute"
|
31
|
+
# hmm... should return oth for usability?
|
32
|
+
return oth, oth
|
33
|
+
end
|
34
|
+
|
35
|
+
if self.absolute?
|
36
|
+
return self.dup, oth
|
37
|
+
else
|
38
|
+
return oth, oth
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
@@ -1,17 +1,22 @@
|
|
1
|
-
# Copyright
|
2
|
-
#
|
3
|
-
# This program is free software: you can redistribute it and/or modify
|
4
|
-
# it under the terms of the GNU General Public License as published by
|
5
|
-
# the Free Software Foundation, either version 3 of the License, or
|
6
|
-
# (at your option) any later version.
|
1
|
+
# Copyright, 2015, by Samuel G. D. Williams. <http://www.codeotaku.com>
|
7
2
|
#
|
8
|
-
#
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
12
9
|
#
|
13
|
-
#
|
14
|
-
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
15
20
|
|
16
21
|
require 'nokogiri'
|
17
22
|
|
@@ -1,20 +1,25 @@
|
|
1
|
-
# Copyright
|
2
|
-
#
|
3
|
-
# This program is free software: you can redistribute it and/or modify
|
4
|
-
# it under the terms of the GNU General Public License as published by
|
5
|
-
# the Free Software Foundation, either version 3 of the License, or
|
6
|
-
# (at your option) any later version.
|
1
|
+
# Copyright, 2015, by Samuel G. D. Williams. <http://www.codeotaku.com>
|
7
2
|
#
|
8
|
-
#
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
12
9
|
#
|
13
|
-
#
|
14
|
-
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
15
20
|
|
16
21
|
require 'net/http'
|
17
|
-
|
22
|
+
require_relative '../version'
|
18
23
|
|
19
24
|
module Xapian
|
20
25
|
module Indexer
|
@@ -1,17 +1,22 @@
|
|
1
|
-
# Copyright
|
2
|
-
#
|
3
|
-
# This program is free software: you can redistribute it and/or modify
|
4
|
-
# it under the terms of the GNU General Public License as published by
|
5
|
-
# the Free Software Foundation, either version 3 of the License, or
|
6
|
-
# (at your option) any later version.
|
1
|
+
# Copyright, 2015, by Samuel G. D. Williams. <http://www.codeotaku.com>
|
7
2
|
#
|
8
|
-
#
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
12
9
|
#
|
13
|
-
#
|
14
|
-
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
15
20
|
|
16
21
|
require 'digest/md5'
|
17
22
|
require 'yaml'
|
@@ -1,17 +1,22 @@
|
|
1
|
-
# Copyright
|
2
|
-
#
|
3
|
-
# This program is free software: you can redistribute it and/or modify
|
4
|
-
# it under the terms of the GNU General Public License as published by
|
5
|
-
# the Free Software Foundation, either version 3 of the License, or
|
6
|
-
# (at your option) any later version.
|
1
|
+
# Copyright, 2015, by Samuel G. D. Williams. <http://www.codeotaku.com>
|
7
2
|
#
|
8
|
-
#
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
12
9
|
#
|
13
|
-
#
|
14
|
-
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
15
20
|
|
16
21
|
require 'xapian'
|
17
22
|
require 'set'
|
@@ -1,20 +1,25 @@
|
|
1
|
-
# Copyright
|
2
|
-
#
|
3
|
-
# This program is free software: you can redistribute it and/or modify
|
4
|
-
# it under the terms of the GNU General Public License as published by
|
5
|
-
# the Free Software Foundation, either version 3 of the License, or
|
6
|
-
# (at your option) any later version.
|
1
|
+
# Copyright, 2015, by Samuel G. D. Williams. <http://www.codeotaku.com>
|
7
2
|
#
|
8
|
-
#
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
12
9
|
#
|
13
|
-
#
|
14
|
-
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
15
20
|
|
16
21
|
module Xapian
|
17
22
|
module Indexer
|
18
|
-
VERSION = "1.2.19.
|
23
|
+
VERSION = "1.2.19.2"
|
19
24
|
end
|
20
25
|
end
|
data/xapian-indexer.gemspec
CHANGED
@@ -17,7 +17,7 @@ Gem::Specification.new do |spec|
|
|
17
17
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
18
|
spec.require_paths = ["lib"]
|
19
19
|
|
20
|
-
spec.add_dependency 'xapian-core'
|
20
|
+
spec.add_dependency 'xapian-core', '~> 1.2.19.1'
|
21
21
|
spec.add_dependency 'nokogiri'
|
22
22
|
|
23
23
|
spec.add_development_dependency "bundler", "~> 1.6"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: xapian-indexer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.19.
|
4
|
+
version: 1.2.19.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Samuel Williams
|
@@ -14,16 +14,16 @@ dependencies:
|
|
14
14
|
name: xapian-core
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 1.2.19.1
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 1.2.19.1
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: nokogiri
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -78,6 +78,7 @@ files:
|
|
78
78
|
- README.md
|
79
79
|
- Rakefile
|
80
80
|
- lib/xapian/indexer.rb
|
81
|
+
- lib/xapian/indexer/extensions/uri.rb
|
81
82
|
- lib/xapian/indexer/extractors/html.rb
|
82
83
|
- lib/xapian/indexer/loaders/http.rb
|
83
84
|
- lib/xapian/indexer/resource.rb
|