xapian-indexer 1.2.3.2 → 1.2.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/xapian/indexer/extractors/html.rb +9 -5
- data/lib/xapian/indexer/version.rb +1 -1
- metadata +18 -4
@@ -20,6 +20,9 @@ module Xapian
|
|
20
20
|
module Extractors
|
21
21
|
# Represents a resource that will be indexed
|
22
22
|
class HTML
|
23
|
+
NBSP = Nokogiri::HTML("&nbsp;").text
|
24
|
+
WHITESPACE = /(\s|#{NBSP})+/
|
25
|
+
|
23
26
|
def initialize(options = {})
|
24
27
|
@options = options
|
25
28
|
|
@@ -40,7 +43,7 @@ module Xapian
|
|
40
43
|
first_paragraph = html.search("p").first
|
41
44
|
|
42
45
|
if first_paragraph
|
43
|
-
result[:description] = first_paragraph.inner_text
|
46
|
+
result[:description] = first_paragraph.inner_text.gsub(WHITESPACE, " ")
|
44
47
|
end
|
45
48
|
end
|
46
49
|
|
@@ -76,15 +79,15 @@ module Xapian
|
|
76
79
|
title_tag = html.at('html/head/title')
|
77
80
|
h1_tag = html.search('h1').first
|
78
81
|
if title_tag
|
79
|
-
result[:title] = title_tag.inner_text
|
82
|
+
result[:title] = title_tag.inner_text.gsub(WHITESPACE, " ")
|
80
83
|
elsif h1_tag
|
81
|
-
result[:title] = h1_tag.inner_text
|
84
|
+
result[:title] = h1_tag.inner_text.gsub(WHITESPACE, " ")
|
82
85
|
end
|
83
86
|
|
84
87
|
# Extract keywords
|
85
88
|
meta_keywords = html.css("meta[name='keyword']").first
|
86
89
|
if meta_keywords
|
87
|
-
result[:keywords] = meta_keywords['content']
|
90
|
+
result[:keywords] = meta_keywords['content'].gsub(WHITESPACE, " ")
|
88
91
|
end
|
89
92
|
|
90
93
|
# Remove junk elements from the html
|
@@ -99,7 +102,8 @@ module Xapian
|
|
99
102
|
body = html.at('html/body')
|
100
103
|
|
101
104
|
if body
|
102
|
-
|
105
|
+
# We also convert NBSP characters to inner space.
|
106
|
+
result[:content] = body.inner_text.gsub(WHITESPACE, " ")
|
103
107
|
end
|
104
108
|
|
105
109
|
return result
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: xapian-indexer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 75
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 2
|
9
9
|
- 3
|
10
|
-
- 2
|
11
|
-
version: 1.2.3.2
|
10
|
+
- 4
|
11
|
+
version: 1.2.3.4
|
12
12
|
platform: ruby
|
13
13
|
authors:
|
14
14
|
- Samuel Williams
|
@@ -16,7 +16,7 @@ autorequire:
|
|
16
16
|
bindir: bin
|
17
17
|
cert_chain: []
|
18
18
|
|
19
|
-
date:
|
19
|
+
date: 2011-01-24 00:00:00 +13:00
|
20
20
|
default_executable:
|
21
21
|
dependencies:
|
22
22
|
- !ruby/object:Gem::Dependency
|
@@ -33,6 +33,20 @@ dependencies:
|
|
33
33
|
version: "0"
|
34
34
|
type: :runtime
|
35
35
|
version_requirements: *id001
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: nokogiri
|
38
|
+
prerelease: false
|
39
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ">="
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
hash: 3
|
45
|
+
segments:
|
46
|
+
- 0
|
47
|
+
version: "0"
|
48
|
+
type: :runtime
|
49
|
+
version_requirements: *id002
|
36
50
|
description:
|
37
51
|
email: samuel.williams@oriontransfer.co.nz
|
38
52
|
executables: []
|