scrappy 0.4.8 → 0.4.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,7 @@
1
+ === 0.4.9 2011-11-28
2
+
3
+ * Refactorization of how encoding issues are managed
4
+
1
5
  === 0.4.8 2011-11-24
2
6
 
3
7
  * Encoding fixes
@@ -8,7 +8,6 @@ require 'ostruct'
8
8
  require 'active_support'
9
9
  require 'tmpdir'
10
10
  require 'lightrdf'
11
- require 'iconv'
12
11
 
13
12
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
14
13
 
@@ -25,5 +24,5 @@ require 'scrappy/agent/blind_agent'
25
24
  require 'scrappy/agent/agent'
26
25
 
27
26
  module Scrappy
28
- VERSION = '0.4.8'
27
+ VERSION = '0.4.9'
29
28
  end
@@ -4,7 +4,7 @@ module Scrappy
4
4
  def format node, formats, uri
5
5
  case formats.first
6
6
  when Node('sc:WikiText') then
7
- doc = Nokogiri::XML(node.to_html)
7
+ doc = Nokogiri::XML(node.to_html.clean)
8
8
  doc.search("a").each {|n| n.replace(Nokogiri::XML::Text.new(URI.parse(uri).merge(n["href"]).to_s, n.document)) }
9
9
  doc.search("h1").each {|n| n.replace(Nokogiri::XML::Text.new("= #{n.text.strip} =", n.document)) }
10
10
  doc.search("h2").each {|n| n.replace(Nokogiri::XML::Text.new("== #{n.text.strip} ==", n.document)) }
@@ -26,15 +26,15 @@ module Scrappy
26
26
  doc.text.strip
27
27
  when Node('sc:Html') then
28
28
  if node.respond_to? :to_html
29
- node.to_html
29
+ node.to_html.clean
30
30
  else
31
- node.to_s
31
+ node.to_s.clean
32
32
  end
33
33
  else
34
34
  if node.respond_to? :text
35
- node.text
35
+ node.text.clean
36
36
  else
37
- node.to_s
37
+ node.to_s.clean
38
38
  end
39
39
  end
40
40
  end
@@ -65,14 +65,12 @@ module Sc
65
65
 
66
66
  # Build the object -- it can be a node or a literal
67
67
  object = if sc::type.include?(Node('rdf:Literal'))
68
- ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
69
- value = ic.iconv(doc[:value].to_s + ' ')[0..-2].gsub("\302\240"," ").strip
70
68
  if options[:referenceable]
71
- node.rdf::value = value
69
+ node.rdf::value = doc[:value]
72
70
  node.rdf::type += [Node('rdf:Literal')]
73
71
  node
74
72
  else
75
- value
73
+ doc[:value]
76
74
  end
77
75
  else
78
76
  # Add statements about the node
@@ -20,13 +20,13 @@ module Sc
20
20
  # Process selector
21
21
  # Filter method is defined in each subclass
22
22
  results = filter doc
23
-
23
+
24
24
  if sc::boolean.first=="true"
25
25
  results = results.map do |r|
26
26
  affirmations = ["yes", "true"]
27
27
  negations = ["no", "none", "false", "-", "--"]
28
- no = negations.include?(r[:value].gsub("\302\240"," ").strip.downcase)
29
- yes = affirmations.include?(r[:value].gsub("\302\240"," ").strip.downcase)
28
+ no = negations.include?(r[:value].downcase)
29
+ yes = affirmations.include?(r[:value].downcase)
30
30
  if no
31
31
  value = "false"
32
32
  elsif yes
@@ -41,10 +41,16 @@ module Sc
41
41
  if sc::normalize_max.first
42
42
  max = sc::normalize_max.first.to_f
43
43
  min = sc::normalize_min.first.to_f
44
- results.each { |r| r[:value] = ((r[:value].to_f-min) / (max-min)).to_s }
44
+ in_range = sc::normalize_in_range.first == "true"
45
+ results.each do |r|
46
+ r[:value] = ((r[:value].to_f-min) / (max-min)).to_s
47
+ end
48
+ if in_range
49
+ results = results.select { |r| r[:value].to_f <= 1.0 and r[:value].to_f >= 0.0 }
50
+ end
45
51
  end
46
52
  if sc::nonempty.first=="true"
47
- results = results.select{ |r| r[:value].gsub("\302\240"," ").strip!=""}
53
+ results = results.select{ |r| r[:value] != ""}
48
54
  end
49
55
 
50
56
  if sc::debug.first=="true" and Scrappy::Agent::Options.debug and
@@ -62,7 +62,7 @@ module Sc
62
62
  end.map do |content|
63
63
  if attributes.first
64
64
  # Select node's attribute if given
65
- attributes.map { |attribute| { :uri=>doc[:uri], :content=>content, :value=>content[attribute], :attribute=>attribute } }
65
+ attributes.map { |attribute| { :uri=>doc[:uri], :content=>content, :value=>content[attribute].clean, :attribute=>attribute } }
66
66
  else
67
67
  [ { :uri=>doc[:uri], :content=>content, :value=>format(content, formats, doc[:uri]) } ]
68
68
  end
@@ -8,10 +8,12 @@ module Sc
8
8
  (0..-1)
9
9
  end
10
10
  patterns = sc::keyword
11
- (doc[:content].search(pattern)[interval] || []).select { |node| patterns.any? ? patterns.include?(node.text.downcase.strip) : true }.map do |result|
11
+ (doc[:content].search(pattern)[interval] || []).select do |node|
12
+ patterns.any? ? patterns.include?(node.text.clean.downcase) : true
13
+ end.map do |result|
12
14
  if sc::attribute.first
13
15
  # Select node's attribute if given
14
- sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute], :attribute=>attribute } }
16
+ sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute].clean, :attribute=>attribute } }
15
17
  else
16
18
  # Select node
17
19
  [ { :uri=>doc[:uri], :content=>result, :value=>format(result, sc::format, doc[:uri]) } ]
@@ -1,3 +1,5 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'iconv'
1
3
  require 'open-uri'
2
4
  require 'net/http'
3
5
  require 'net/https'
@@ -19,6 +21,8 @@ module Nokogiri
19
21
  end
20
22
 
21
23
  class String
24
+ Utf8Iconv = Iconv.new('UTF-8//IGNORE', 'UTF-8')
25
+
22
26
  def wikify
23
27
  gsub(/^[a-z]|\s+[a-z]/) { |a| a.upcase }.gsub(/\s/, '')
24
28
  end
@@ -30,4 +34,7 @@ class String
30
34
  gsub(/\s+/,"_").
31
35
  downcase
32
36
  end
37
+ def clean
38
+ Utf8Iconv.iconv(self + '  ')[0..-3].gsub("\302\240"," ").strip
39
+ end
33
40
  end
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = "scrappy"
5
- s.version = "0.4.8"
5
+ s.version = "0.4.9"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
9
- s.date = "2011-11-24"
9
+ s.date = "2011-11-28"
10
10
  s.description = "RDF web scraper"
11
11
  s.email = "joseignacio.fernandez@gmail.com"
12
12
  s.executables = ["scrappy"]
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrappy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.8
4
+ version: 0.4.9
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-11-24 00:00:00.000000000Z
12
+ date: 2011-11-28 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: activesupport
16
- requirement: &82230020 !ruby/object:Gem::Requirement
16
+ requirement: &85217640 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 2.3.5
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *82230020
24
+ version_requirements: *85217640
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: sinatra
27
- requirement: &82229760 !ruby/object:Gem::Requirement
27
+ requirement: &85217340 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 1.1.2
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *82229760
35
+ version_requirements: *85217340
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: thin
38
- requirement: &82229510 !ruby/object:Gem::Requirement
38
+ requirement: &85217090 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 1.2.7
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *82229510
46
+ version_requirements: *85217090
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: nokogiri
49
- requirement: &82229270 !ruby/object:Gem::Requirement
49
+ requirement: &85216790 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: 1.4.1
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *82229270
57
+ version_requirements: *85216790
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: mechanize
60
- requirement: &82229010 !ruby/object:Gem::Requirement
60
+ requirement: &85216530 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: 1.0.0
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *82229010
68
+ version_requirements: *85216530
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: lightrdf
71
- requirement: &82228740 !ruby/object:Gem::Requirement
71
+ requirement: &85216280 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: 0.4.1
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *82228740
79
+ version_requirements: *85216280
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: i18n
82
- requirement: &82228500 !ruby/object:Gem::Requirement
82
+ requirement: &85215970 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: 0.4.2
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *82228500
90
+ version_requirements: *85215970
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: rest-client
93
- requirement: &82228260 !ruby/object:Gem::Requirement
93
+ requirement: &85215720 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: 1.6.1
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *82228260
101
+ version_requirements: *85215720
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &82227980 !ruby/object:Gem::Requirement
104
+ requirement: &85215430 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: 3.0.24
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *82227980
112
+ version_requirements: *85215430
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: rack-flash
115
- requirement: &82227730 !ruby/object:Gem::Requirement
115
+ requirement: &85215190 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: 0.1.1
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *82227730
123
+ version_requirements: *85215190
124
124
  description: RDF web scraper
125
125
  email: joseignacio.fernandez@gmail.com
126
126
  executables: