xapian-indexer 1.2.3.2 → 1.2.3.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -20,6 +20,9 @@ module Xapian
20
20
  module Extractors
21
21
  # Represents a resource that will be indexed
22
22
  class HTML
23
+ NBSP = Nokogiri::HTML("&nbsp;").text
24
+ WHITESPACE = /(\s|#{NBSP})+/
25
+
23
26
  def initialize(options = {})
24
27
  @options = options
25
28
 
@@ -40,7 +43,7 @@ module Xapian
40
43
  first_paragraph = html.search("p").first
41
44
 
42
45
  if first_paragraph
43
- result[:description] = first_paragraph.inner_text
46
+ result[:description] = first_paragraph.inner_text.gsub(WHITESPACE, " ")
44
47
  end
45
48
  end
46
49
 
@@ -76,15 +79,15 @@ module Xapian
76
79
  title_tag = html.at('html/head/title')
77
80
  h1_tag = html.search('h1').first
78
81
  if title_tag
79
- result[:title] = title_tag.inner_text
82
+ result[:title] = title_tag.inner_text.gsub(WHITESPACE, " ")
80
83
  elsif h1_tag
81
- result[:title] = h1_tag.inner_text
84
+ result[:title] = h1_tag.inner_text.gsub(WHITESPACE, " ")
82
85
  end
83
86
 
84
87
  # Extract keywords
85
88
  meta_keywords = html.css("meta[name='keyword']").first
86
89
  if meta_keywords
87
- result[:keywords] = meta_keywords['content']
90
+ result[:keywords] = meta_keywords['content'].gsub(WHITESPACE, " ")
88
91
  end
89
92
 
90
93
  # Remove junk elements from the html
@@ -99,7 +102,8 @@ module Xapian
99
102
  body = html.at('html/body')
100
103
 
101
104
  if body
102
- result[:content] = body.inner_text.gsub(/\s+/, " ")
105
+ # Collapse each run of whitespace, including non-breaking spaces (NBSP), into a single regular space.
106
+ result[:content] = body.inner_text.gsub(WHITESPACE, " ")
103
107
  end
104
108
 
105
109
  return result
@@ -19,7 +19,7 @@ module Xapian
19
19
  MAJOR = 1
20
20
  MINOR = 2
21
21
  TINY = 3
22
- REV = 2
22
+ REV = 4
23
23
 
24
24
  STRING = [MAJOR, MINOR, TINY, REV].join('.')
25
25
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: xapian-indexer
3
3
  version: !ruby/object:Gem::Version
4
- hash: 71
4
+ hash: 75
5
5
  prerelease: false
6
6
  segments:
7
7
  - 1
8
8
  - 2
9
9
  - 3
10
- - 2
11
- version: 1.2.3.2
10
+ - 4
11
+ version: 1.2.3.4
12
12
  platform: ruby
13
13
  authors:
14
14
  - Samuel Williams
@@ -16,7 +16,7 @@ autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2010-12-22 00:00:00 +13:00
19
+ date: 2011-01-24 00:00:00 +13:00
20
20
  default_executable:
21
21
  dependencies:
22
22
  - !ruby/object:Gem::Dependency
@@ -33,6 +33,20 @@ dependencies:
33
33
  version: "0"
34
34
  type: :runtime
35
35
  version_requirements: *id001
36
+ - !ruby/object:Gem::Dependency
37
+ name: nokogiri
38
+ prerelease: false
39
+ requirement: &id002 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ hash: 3
45
+ segments:
46
+ - 0
47
+ version: "0"
48
+ type: :runtime
49
+ version_requirements: *id002
36
50
  description:
37
51
  email: samuel.williams@oriontransfer.co.nz
38
52
  executables: []