scrappy 0.4.8 → 0.4.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,7 @@
1
+ === 0.4.9 2011-11-28
2
+
3
+ * Refactorization of how encoding issues are managed
4
+
1
5
  === 0.4.8 2011-11-24
2
6
 
3
7
  * Encoding fixes
@@ -8,7 +8,6 @@ require 'ostruct'
8
8
  require 'active_support'
9
9
  require 'tmpdir'
10
10
  require 'lightrdf'
11
- require 'iconv'
12
11
 
13
12
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
14
13
 
@@ -25,5 +24,5 @@ require 'scrappy/agent/blind_agent'
25
24
  require 'scrappy/agent/agent'
26
25
 
27
26
  module Scrappy
28
- VERSION = '0.4.8'
27
+ VERSION = '0.4.9'
29
28
  end
@@ -4,7 +4,7 @@ module Scrappy
4
4
  def format node, formats, uri
5
5
  case formats.first
6
6
  when Node('sc:WikiText') then
7
- doc = Nokogiri::XML(node.to_html)
7
+ doc = Nokogiri::XML(node.to_html.clean)
8
8
  doc.search("a").each {|n| n.replace(Nokogiri::XML::Text.new(URI.parse(uri).merge(n["href"]).to_s, n.document)) }
9
9
  doc.search("h1").each {|n| n.replace(Nokogiri::XML::Text.new("= #{n.text.strip} =", n.document)) }
10
10
  doc.search("h2").each {|n| n.replace(Nokogiri::XML::Text.new("== #{n.text.strip} ==", n.document)) }
@@ -26,15 +26,15 @@ module Scrappy
26
26
  doc.text.strip
27
27
  when Node('sc:Html') then
28
28
  if node.respond_to? :to_html
29
- node.to_html
29
+ node.to_html.clean
30
30
  else
31
- node.to_s
31
+ node.to_s.clean
32
32
  end
33
33
  else
34
34
  if node.respond_to? :text
35
- node.text
35
+ node.text.clean
36
36
  else
37
- node.to_s
37
+ node.to_s.clean
38
38
  end
39
39
  end
40
40
  end
@@ -65,14 +65,12 @@ module Sc
65
65
 
66
66
  # Build the object -- it can be a node or a literal
67
67
  object = if sc::type.include?(Node('rdf:Literal'))
68
- ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
69
- value = ic.iconv(doc[:value].to_s + ' ')[0..-2].gsub("\302\240"," ").strip
70
68
  if options[:referenceable]
71
- node.rdf::value = value
69
+ node.rdf::value = doc[:value]
72
70
  node.rdf::type += [Node('rdf:Literal')]
73
71
  node
74
72
  else
75
- value
73
+ doc[:value]
76
74
  end
77
75
  else
78
76
  # Add statements about the node
@@ -20,13 +20,13 @@ module Sc
20
20
  # Process selector
21
21
  # Filter method is defined in each subclass
22
22
  results = filter doc
23
-
23
+
24
24
  if sc::boolean.first=="true"
25
25
  results = results.map do |r|
26
26
  affirmations = ["yes", "true"]
27
27
  negations = ["no", "none", "false", "-", "--"]
28
- no = negations.include?(r[:value].gsub("\302\240"," ").strip.downcase)
29
- yes = affirmations.include?(r[:value].gsub("\302\240"," ").strip.downcase)
28
+ no = negations.include?(r[:value].downcase)
29
+ yes = affirmations.include?(r[:value].downcase)
30
30
  if no
31
31
  value = "false"
32
32
  elsif yes
@@ -41,10 +41,16 @@ module Sc
41
41
  if sc::normalize_max.first
42
42
  max = sc::normalize_max.first.to_f
43
43
  min = sc::normalize_min.first.to_f
44
- results.each { |r| r[:value] = ((r[:value].to_f-min) / (max-min)).to_s }
44
+ in_range = sc::normalize_in_range.first == "true"
45
+ results.each do |r|
46
+ r[:value] = ((r[:value].to_f-min) / (max-min)).to_s
47
+ end
48
+ if in_range
49
+ results = results.select { |r| r[:value].to_f <= 1.0 and r[:value].to_f >= 0.0 }
50
+ end
45
51
  end
46
52
  if sc::nonempty.first=="true"
47
- results = results.select{ |r| r[:value].gsub("\302\240"," ").strip!=""}
53
+ results = results.select{ |r| r[:value] != ""}
48
54
  end
49
55
 
50
56
  if sc::debug.first=="true" and Scrappy::Agent::Options.debug and
@@ -62,7 +62,7 @@ module Sc
62
62
  end.map do |content|
63
63
  if attributes.first
64
64
  # Select node's attribute if given
65
- attributes.map { |attribute| { :uri=>doc[:uri], :content=>content, :value=>content[attribute], :attribute=>attribute } }
65
+ attributes.map { |attribute| { :uri=>doc[:uri], :content=>content, :value=>content[attribute].clean, :attribute=>attribute } }
66
66
  else
67
67
  [ { :uri=>doc[:uri], :content=>content, :value=>format(content, formats, doc[:uri]) } ]
68
68
  end
@@ -8,10 +8,12 @@ module Sc
8
8
  (0..-1)
9
9
  end
10
10
  patterns = sc::keyword
11
- (doc[:content].search(pattern)[interval] || []).select { |node| patterns.any? ? patterns.include?(node.text.downcase.strip) : true }.map do |result|
11
+ (doc[:content].search(pattern)[interval] || []).select do |node|
12
+ patterns.any? ? patterns.include?(node.text.clean.downcase) : true
13
+ end.map do |result|
12
14
  if sc::attribute.first
13
15
  # Select node's attribute if given
14
- sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute], :attribute=>attribute } }
16
+ sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute].clean, :attribute=>attribute } }
15
17
  else
16
18
  # Select node
17
19
  [ { :uri=>doc[:uri], :content=>result, :value=>format(result, sc::format, doc[:uri]) } ]
@@ -1,3 +1,5 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'iconv'
1
3
  require 'open-uri'
2
4
  require 'net/http'
3
5
  require 'net/https'
@@ -19,6 +21,8 @@ module Nokogiri
19
21
  end
20
22
 
21
23
  class String
24
+ Utf8Iconv = Iconv.new('UTF-8//IGNORE', 'UTF-8')
25
+
22
26
  def wikify
23
27
  gsub(/^[a-z]|\s+[a-z]/) { |a| a.upcase }.gsub(/\s/, '')
24
28
  end
@@ -30,4 +34,7 @@ class String
30
34
  gsub(/\s+/,"_").
31
35
  downcase
32
36
  end
37
+ def clean
38
+ Utf8Iconv.iconv(self + ' ')[0..-3].gsub("\302\240"," ").strip
39
+ end
33
40
  end
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = "scrappy"
5
- s.version = "0.4.8"
5
+ s.version = "0.4.9"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
9
- s.date = "2011-11-24"
9
+ s.date = "2011-11-28"
10
10
  s.description = "RDF web scraper"
11
11
  s.email = "joseignacio.fernandez@gmail.com"
12
12
  s.executables = ["scrappy"]
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrappy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.8
4
+ version: 0.4.9
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-11-24 00:00:00.000000000Z
12
+ date: 2011-11-28 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: activesupport
16
- requirement: &82230020 !ruby/object:Gem::Requirement
16
+ requirement: &85217640 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 2.3.5
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *82230020
24
+ version_requirements: *85217640
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: sinatra
27
- requirement: &82229760 !ruby/object:Gem::Requirement
27
+ requirement: &85217340 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 1.1.2
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *82229760
35
+ version_requirements: *85217340
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: thin
38
- requirement: &82229510 !ruby/object:Gem::Requirement
38
+ requirement: &85217090 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 1.2.7
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *82229510
46
+ version_requirements: *85217090
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: nokogiri
49
- requirement: &82229270 !ruby/object:Gem::Requirement
49
+ requirement: &85216790 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: 1.4.1
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *82229270
57
+ version_requirements: *85216790
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: mechanize
60
- requirement: &82229010 !ruby/object:Gem::Requirement
60
+ requirement: &85216530 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: 1.0.0
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *82229010
68
+ version_requirements: *85216530
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: lightrdf
71
- requirement: &82228740 !ruby/object:Gem::Requirement
71
+ requirement: &85216280 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: 0.4.1
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *82228740
79
+ version_requirements: *85216280
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: i18n
82
- requirement: &82228500 !ruby/object:Gem::Requirement
82
+ requirement: &85215970 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: 0.4.2
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *82228500
90
+ version_requirements: *85215970
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: rest-client
93
- requirement: &82228260 !ruby/object:Gem::Requirement
93
+ requirement: &85215720 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: 1.6.1
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *82228260
101
+ version_requirements: *85215720
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &82227980 !ruby/object:Gem::Requirement
104
+ requirement: &85215430 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: 3.0.24
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *82227980
112
+ version_requirements: *85215430
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: rack-flash
115
- requirement: &82227730 !ruby/object:Gem::Requirement
115
+ requirement: &85215190 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: 0.1.1
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *82227730
123
+ version_requirements: *85215190
124
124
  description: RDF web scraper
125
125
  email: joseignacio.fernandez@gmail.com
126
126
  executables: