xapian-indexer 1.2.3.2 → 1.2.3.4
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/xapian/indexer/extractors/html.rb +9 -5
- data/lib/xapian/indexer/version.rb +1 -1
- metadata +18 -4
@@ -20,6 +20,9 @@ module Xapian
|
|
20
20
|
module Extractors
|
21
21
|
# Represents a resource that will be indexed
|
22
22
|
class HTML
|
23
|
+
NBSP = Nokogiri::HTML("&nbsp;").text
|
24
|
+
WHITESPACE = /(\s|#{NBSP})+/
|
25
|
+
|
23
26
|
def initialize(options = {})
|
24
27
|
@options = options
|
25
28
|
|
@@ -40,7 +43,7 @@ module Xapian
|
|
40
43
|
first_paragraph = html.search("p").first
|
41
44
|
|
42
45
|
if first_paragraph
|
43
|
-
result[:description] = first_paragraph.inner_text
|
46
|
+
result[:description] = first_paragraph.inner_text.gsub(WHITESPACE, " ")
|
44
47
|
end
|
45
48
|
end
|
46
49
|
|
@@ -76,15 +79,15 @@ module Xapian
|
|
76
79
|
title_tag = html.at('html/head/title')
|
77
80
|
h1_tag = html.search('h1').first
|
78
81
|
if title_tag
|
79
|
-
result[:title] = title_tag.inner_text
|
82
|
+
result[:title] = title_tag.inner_text.gsub(WHITESPACE, " ")
|
80
83
|
elsif h1_tag
|
81
|
-
result[:title] = h1_tag.inner_text
|
84
|
+
result[:title] = h1_tag.inner_text.gsub(WHITESPACE, " ")
|
82
85
|
end
|
83
86
|
|
84
87
|
# Extract keywords
|
85
88
|
meta_keywords = html.css("meta[name='keyword']").first
|
86
89
|
if meta_keywords
|
87
|
-
result[:keywords] = meta_keywords['content']
|
90
|
+
result[:keywords] = meta_keywords['content'].gsub(WHITESPACE, " ")
|
88
91
|
end
|
89
92
|
|
90
93
|
# Remove junk elements from the html
|
@@ -99,7 +102,8 @@ module Xapian
|
|
99
102
|
body = html.at('html/body')
|
100
103
|
|
101
104
|
if body
|
102
|
-
|
105
|
+
# We also convert NBSP characters to inner space.
|
106
|
+
result[:content] = body.inner_text.gsub(WHITESPACE, " ")
|
103
107
|
end
|
104
108
|
|
105
109
|
return result
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: xapian-indexer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 75
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 2
|
9
9
|
- 3
|
10
|
-
- 2
|
11
|
-
version: 1.2.3.2
|
10
|
+
- 4
|
11
|
+
version: 1.2.3.4
|
12
12
|
platform: ruby
|
13
13
|
authors:
|
14
14
|
- Samuel Williams
|
@@ -16,7 +16,7 @@ autorequire:
|
|
16
16
|
bindir: bin
|
17
17
|
cert_chain: []
|
18
18
|
|
19
|
-
date:
|
19
|
+
date: 2011-01-24 00:00:00 +13:00
|
20
20
|
default_executable:
|
21
21
|
dependencies:
|
22
22
|
- !ruby/object:Gem::Dependency
|
@@ -33,6 +33,20 @@ dependencies:
|
|
33
33
|
version: "0"
|
34
34
|
type: :runtime
|
35
35
|
version_requirements: *id001
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: nokogiri
|
38
|
+
prerelease: false
|
39
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ">="
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
hash: 3
|
45
|
+
segments:
|
46
|
+
- 0
|
47
|
+
version: "0"
|
48
|
+
type: :runtime
|
49
|
+
version_requirements: *id002
|
36
50
|
description:
|
37
51
|
email: samuel.williams@oriontransfer.co.nz
|
38
52
|
executables: []
|