xapian-indexer 1.2.3.2 → 1.2.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,6 +20,9 @@ module Xapian
20
20
  module Extractors
21
21
  # Represents a resource that will be indexed
22
22
  class HTML
23
+ NBSP = Nokogiri::HTML("&nbsp;").text
24
+ WHITESPACE = /(\s|#{NBSP})+/
25
+
23
26
  def initialize(options = {})
24
27
  @options = options
25
28
 
@@ -40,7 +43,7 @@ module Xapian
40
43
  first_paragraph = html.search("p").first
41
44
 
42
45
  if first_paragraph
43
- result[:description] = first_paragraph.inner_text
46
+ result[:description] = first_paragraph.inner_text.gsub(WHITESPACE, " ")
44
47
  end
45
48
  end
46
49
 
@@ -76,15 +79,15 @@ module Xapian
76
79
  title_tag = html.at('html/head/title')
77
80
  h1_tag = html.search('h1').first
78
81
  if title_tag
79
- result[:title] = title_tag.inner_text
82
+ result[:title] = title_tag.inner_text.gsub(WHITESPACE, " ")
80
83
  elsif h1_tag
81
- result[:title] = h1_tag.inner_text
84
+ result[:title] = h1_tag.inner_text.gsub(WHITESPACE, " ")
82
85
  end
83
86
 
84
87
  # Extract keywords
85
88
  meta_keywords = html.css("meta[name='keyword']").first
86
89
  if meta_keywords
87
- result[:keywords] = meta_keywords['content']
90
+ result[:keywords] = meta_keywords['content'].gsub(WHITESPACE, " ")
88
91
  end
89
92
 
90
93
  # Remove junk elements from the html
@@ -99,7 +102,8 @@ module Xapian
99
102
  body = html.at('html/body')
100
103
 
101
104
  if body
102
- result[:content] = body.inner_text.gsub(/\s+/, " ")
105
+ # Collapse runs of whitespace, including NBSP characters, into a single regular space.
106
+ result[:content] = body.inner_text.gsub(WHITESPACE, " ")
103
107
  end
104
108
 
105
109
  return result
@@ -19,7 +19,7 @@ module Xapian
19
19
  MAJOR = 1
20
20
  MINOR = 2
21
21
  TINY = 3
22
- REV = 2
22
+ REV = 4
23
23
 
24
24
  STRING = [MAJOR, MINOR, TINY, REV].join('.')
25
25
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: xapian-indexer
3
3
  version: !ruby/object:Gem::Version
4
- hash: 71
4
+ hash: 75
5
5
  prerelease: false
6
6
  segments:
7
7
  - 1
8
8
  - 2
9
9
  - 3
10
- - 2
11
- version: 1.2.3.2
10
+ - 4
11
+ version: 1.2.3.4
12
12
  platform: ruby
13
13
  authors:
14
14
  - Samuel Williams
@@ -16,7 +16,7 @@ autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2010-12-22 00:00:00 +13:00
19
+ date: 2011-01-24 00:00:00 +13:00
20
20
  default_executable:
21
21
  dependencies:
22
22
  - !ruby/object:Gem::Dependency
@@ -33,6 +33,20 @@ dependencies:
33
33
  version: "0"
34
34
  type: :runtime
35
35
  version_requirements: *id001
36
+ - !ruby/object:Gem::Dependency
37
+ name: nokogiri
38
+ prerelease: false
39
+ requirement: &id002 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ hash: 3
45
+ segments:
46
+ - 0
47
+ version: "0"
48
+ type: :runtime
49
+ version_requirements: *id002
36
50
  description:
37
51
  email: samuel.williams@oriontransfer.co.nz
38
52
  executables: []