scrappy 0.3.4 → 0.3.5

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,7 @@
1
+ === 0.3.5 2011-03-29
2
+
3
+ * Added nofollow support to NewUriSelectors
4
+
1
5
  === 0.3.4 2011-03-25
2
6
 
3
7
  * Fixed gem dependency
data/lib/scrappy.rb CHANGED
@@ -24,5 +24,5 @@ require 'scrappy/agent/blind_agent'
24
24
  require 'scrappy/agent/agent'
25
25
 
26
26
  module Scrappy
27
- VERSION = '0.3.4'
27
+ VERSION = '0.3.5'
28
28
  end
@@ -71,17 +71,22 @@ module Scrappy
71
71
  else
72
72
  []
73
73
  end
74
+
75
+ # Recently created URIs are not followed
76
+ nofollow = triples.select { |s,p,o| p==ID("rdf:type") and o==ID("sc:NewUri") }.map{|s,p,o| s}.select{|n| n.is_a?(Symbol)}
77
+ pages -= nofollow
78
+ uris -= nofollow
74
79
 
75
80
  items = ( pages.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth].max} } +
76
81
  uris.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth-1].max} } ).
77
- uniq.select{ |item| !RDF::ID.bnode?(item[:uri]) }
82
+ uniq.select { |item| !RDF::ID.bnode?(item[:uri]) }
78
83
 
79
- items.each { |item| puts "Enqueuing (depth = #{item[:depth]}): #{item[:uri]}" } if options.debug
84
+ items.each { |item| puts "Enqueuing (depth = #{item[:depth]}): #{item[:uri]}" if !queue or !(queue.history + queue.items).include?(item) } if options.debug
80
85
 
81
- if queue.nil?
82
- triples += process items
83
- else
86
+ if queue
84
87
  items.each { |item| queue.push_unless_done item }
88
+ else
89
+ triples += process items
85
90
  end
86
91
 
87
92
  triples unless options.dump
@@ -153,7 +158,7 @@ module Scrappy
153
158
  end
154
159
 
155
160
  def clean triples
156
- triples.uniq.select { |s,p,o| p!=ID('rdf:type') or ![ID('sc:Index'), ID('sc:Page')].include?(o) }
161
+ triples.uniq.select { |s,p,o| p!=ID('rdf:type') or ![ID('sc:Index'), ID('sc:Page'), ID('sc:NewUri')].include?(o) }
157
162
  end
158
163
 
159
164
  # Do the extraction using RDF repository
@@ -6,6 +6,8 @@ module MapReduce
6
6
  class Queue
7
7
  include MonitorMixin
8
8
 
9
+ attr_reader :history, :items
10
+
9
11
  def initialize
10
12
  super
11
13
  @items = []
@@ -17,7 +19,7 @@ module MapReduce
17
19
  item = nil
18
20
  synchronize do
19
21
  item = @items.shift
20
- @history << item
22
+ @history << item if item
21
23
  if @items.empty?
22
24
  yield item if (block_given? and item)
23
25
  yielded = true
@@ -36,7 +38,7 @@ module MapReduce
36
38
  end
37
39
 
38
40
  def push_unless_done value
39
- synchronize { @items << value unless @history.include?(value) }
41
+ synchronize { @items << value if !@history.include?(value) and !@items.include?(value) }
40
42
  end
41
43
 
42
44
  def empty?
@@ -2,20 +2,19 @@ module Sc
2
2
  class Fragment
3
3
  include RDF::NodeProxy
4
4
 
5
- # Extracts data out of a document and returns an RDF::Graph
6
- def extract_graph options={}
7
- graph = RDF::Graph.new
8
- extract(options).each { |node| graph << node }
9
- graph
10
- end
11
-
12
5
  # Extracts data out of a document and returns an array of nodes
13
6
  def extract options={}
7
+ all_mappings(options).map { |mapping| mapping[:node] }
8
+ end
9
+
10
+ # Returns all mappings of a fragment by
11
+ # recursively processing all submappings.
12
+ def all_mappings options={}
14
13
  # Extracts all the mappings and any subfragment
15
- mappings(options).map do |result|
16
- node = result[:node]
17
- subfragments = result[:subfragments]
18
- doc = result[:doc]
14
+ mappings(options).map do |mapping|
15
+ node = mapping[:node]
16
+ subfragments = mapping[:subfragments]
17
+ doc = mapping[:doc]
19
18
 
20
19
  # Process subfragments
21
20
  consistent = true
@@ -23,18 +22,19 @@ module Sc
23
22
  # Get subfragment object
24
23
  subfragment = subfragment.proxy Node('sc:Fragment')
25
24
 
26
- # Extract data from the subfragment
27
- subnodes = subfragment.extract(options.merge(:doc=>doc))
25
+ # Add triples from submappings
26
+ submappings = subfragment.all_mappings(options.merge(:doc=>doc))
28
27
 
29
28
  # Add relations
30
- subnodes.each do |subnode|
29
+ submappings.each do |submapping|
30
+ subnode = submapping[:node]
31
31
  node.graph << subnode if subnode.is_a?(RDF::Node)
32
32
  subfragment.sc::relation.each { |relation| node[relation] += [subnode] }
33
33
  end
34
34
 
35
35
  # Check consistency
36
- consistent = false if subfragment.sc::min_cardinality.first and subnodes.size < subfragment.sc::min_cardinality.first.to_i
37
- consistent = false if subfragment.sc::max_cardinality.first and subnodes.size > subfragment.sc::max_cardinality.first.to_i
36
+ consistent = false if subfragment.sc::min_cardinality.first and submappings.size < subfragment.sc::min_cardinality.first.to_i
37
+ consistent = false if subfragment.sc::max_cardinality.first and submappings.size > subfragment.sc::max_cardinality.first.to_i
38
38
  end
39
39
 
40
40
  # Skip the node if it has inconsistent relations
@@ -42,11 +42,12 @@ module Sc
42
42
  # violate the constraint sc:min_cardinality = 1
43
43
  next if !consistent
44
44
 
45
- node
45
+ { :node=>node, :subfragments=>subfragments, :doc=>doc }
46
46
  end.compact
47
47
  end
48
48
 
49
- # Returns all the mappings between this fragment and RDF nodes
49
+ # Returns the mappings between this fragment
50
+ # and the RDF nodes it matches
50
51
  def mappings options
51
52
  # Identify the fragment's mappings
52
53
  docs = sc::selector.map { |s| graph.node(s).select options[:doc] }.flatten
@@ -66,7 +67,7 @@ module Sc
66
67
  value = doc[:value].to_s.strip
67
68
  if options[:referenceable]
68
69
  node.rdf::value = value
69
- node.rdf::type = Node('rdf:Literal')
70
+ node.rdf::type += [Node('rdf:Literal')]
70
71
  node
71
72
  else
72
73
  value
@@ -117,6 +118,7 @@ module Sc
117
118
  node.graph << uri_node
118
119
  node.sc::uri = uri_node
119
120
  end
121
+ node.rdf::type += [Node("sc:NewUri")] if d[:nofollow]
120
122
 
121
123
  node
122
124
  end.first
@@ -13,6 +13,8 @@ module Sc
13
13
  prefix = (prefix =~ /\Ahttp\:/ or prefix =~ /\Ahttps\:/) ? URI::parse(doc[:uri]).merge(prefix).to_s : "#{doc[:uri]}#{prefix}"
14
14
  suffix = sc::suffix.first.to_s
15
15
 
16
+ nofollow = (sc::follow.first != "false")
17
+
16
18
  contents.map do |content, attribute|
17
19
  new_uri = if (content.to_s =~ /\Ahttp\:/ or content.to_s =~ /\Ahttps\:/)
18
20
  "#{content}#{suffix}"
@@ -29,7 +31,7 @@ module Sc
29
31
  "#{prefix}#{variable}#{suffix}"
30
32
  end
31
33
 
32
- { :uri=>new_uri, :content=>doc[:content], :value=>new_uri, :attribute=>attribute }
34
+ { :uri=>new_uri, :content=>doc[:content], :value=>new_uri, :attribute=>attribute, :nofollow=>nofollow }
33
35
  end
34
36
  end
35
37
  end
@@ -2,10 +2,7 @@ module Scrappy
2
2
  module Optimizer
3
3
  # Iterates through a knowledge base and tries to merge and generalize
4
4
  # selectors whenever the output of the resulting kb is the same
5
- def optimize kb, sample
6
- # Get the output only once
7
- output = RDF::Graph.new extract(sample[:uri], sample[:html], kb)
8
-
5
+ def optimize_patterns kb, sample
9
6
  # Build an array of fragments
10
7
  root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
11
8
  fragments = []; root_fragments.each { |f| fragments << kb.node(Node(f.id, RDF::Graph.new(f.all_triples))) }
@@ -13,89 +10,119 @@ module Scrappy
13
10
  # Parse the document
14
11
  doc = { :uri=>sample[:uri], :content=>Nokogiri::HTML(sample[:html], nil, 'utf-8') }
15
12
 
16
- begin
17
- changed, fragments = optimize_once fragments, doc, output
18
- end until !changed
13
+ # Optimize the fragment
14
+ fragments = optimize fragments, :docs=>[doc]
19
15
 
20
16
  graph = RDF::Graph.new
21
17
  fragments.each { |fragment| graph << fragment }
22
-
23
- puts graph.serialize(:yarf)
24
- exit
25
18
 
26
19
  graph
27
20
  end
28
21
 
29
22
  protected
30
- # Tries to optimize a set of fragments.
31
- # Returns true if there were changes, false otherwise,
32
- # and the new fragments as the second array element
33
- def optimize_once fragments, doc, output
23
+ # Tries to optimize a set of fragments
24
+ def optimize fragments, options
25
+ # Tries to iterate until no changes are made
26
+ @tried = []
27
+ new_fragments = fragments.map{ |f| f.proxy(Node('sc:Fragment')) }
28
+ begin
29
+ fragments = new_fragments
30
+ new_fragments = optimize_once fragments, options
31
+ end until fragments == new_fragments
32
+ fragments
33
+ end
34
+
35
+ # Tries to perform one optimization of two fragments out of a set of fragments
36
+ def optimize_once fragments, options
37
+ docs = options[:docs]
34
38
  fragments.each do |fragment1|
35
39
  fragments.each do |fragment2|
36
40
  next if fragment1 == fragment2
37
- new_fragment = mix_if_gain(fragment1, fragment2, doc, output)
41
+ # Won't get gain if the fragment does not produce the same kind of RDF resource
42
+ next if !fragment1.sc::type.equivalent?(fragment2.sc::type) or
43
+ !fragment1.sc::relation.equivalent?(fragment2.sc::relation) or
44
+ !fragment1.sc::superclass.equivalent?(fragment2.sc::superclass) or
45
+ !fragment1.sc::sameas.equivalent?(fragment2.sc::sameas) or
46
+ fragment1.sc::identifier.size != fragment2.sc::identifier.size
47
+
48
+ next if @tried.include?([fragment1, fragment2])
49
+ next if @tried.include?([fragment2, fragment1])
50
+
51
+ @tried << [fragment1, fragment2]
52
+
53
+ # Get mappings without mixing fragments
54
+ old_mappings = []
55
+ docs.each do |doc|
56
+ old_mappings += fragment1.all_mappings(:doc=>doc)
57
+ old_mappings += fragment2.all_mappings(:doc=>doc)
58
+ end
59
+ old_docs = old_mappings.map { |mapping| mapping[:doc] }
60
+
61
+ # Get mixed fragment
62
+ new_fragment = mix(fragment1, fragment2, options)
38
63
 
39
- # End if a new fragment was created
40
- if new_fragment
41
- return [true, fragments - [fragment1] - [fragment2] + [new_fragment]]
64
+ # Get new mappings
65
+ new_mappings = []
66
+ docs.each { |doc| new_mappings += new_fragment.mappings(:doc=>doc) }
67
+ new_docs = new_mappings.map { |mapping| mapping[:doc] }
68
+
69
+ # Optimize subfragments
70
+ subfragments = optimize(fragment1.sc::subfragment + fragment2.sc::subfragment, options.merge(:docs=>new_docs))
71
+ subfragments.each { |subfragment| new_fragment.graph << subfragment }
72
+ new_fragment.sc::subfragment = subfragments.map &:node
73
+
74
+ # End if the new fragment returns the same results
75
+ if true
76
+ return fragments - [fragment1] - [fragment2] + [new_fragment]
42
77
  end
43
78
  end
44
79
  end
45
- [false, fragments]
80
+ fragments
46
81
  end
47
82
 
48
- def mix_if_gain fragment1, fragment2, doc, output
49
- # Won't get gain if the fragment does not produce the same kind of RDF resource
50
- return if !fragment1.sc::type.equivalent?(fragment2.sc::type) or
51
- !fragment1.sc::relation.equivalent?(fragment2.sc::relation) or
52
- !fragment1.sc::superclass.equivalent?(fragment2.sc::superclass) or
53
- !fragment1.sc::sameas.equivalent?(fragment2.sc::sameas) or
54
- fragment1.sc::identifier.size != fragment2.sc::identifier.size
83
+ def mix fragment1, fragment2, options
84
+ docs = options[:docs]
55
85
 
56
86
  # Build new fragment
57
- new_fragment = Node(nil)
58
- new_fragment.rdf::type = fragment1.rdf::type
87
+ new_fragment = Node(nil).proxy(Node('sc:Fragment'))
88
+ new_fragment.rdf::type = Node('sc:Fragment')
59
89
  new_fragment.sc::type = fragment1.sc::type
60
90
  new_fragment.sc::relation = fragment1.sc::relation
61
91
  new_fragment.sc::superclass = fragment1.sc::superclass
62
92
  new_fragment.sc::sameas = fragment1.sc::sameas
93
+
94
+ # If fragments share the same parent, cardinality has to increase
95
+ # Otherwise, they might map the same subdocument, so cardinality
96
+ # limits are made more general.
97
+ if fragment1.sc::superfragment.first and fragment1.sc::superfragment == fragment2.sc::superfragment
98
+ new_fragment.sc::min_cardinality = (fragment1.sc::min_cardinality.first.to_i + fragment2.sc::min_cardinality.first.to_i).to_s
99
+ new_fragment.sc::max_cardinality = (fragment1.sc::max_cardinality.first.to_i + fragment2.sc::max_cardinality.first.to_i).to_s
100
+ else
101
+ new_fragment.sc::min_cardinality = [fragment1.sc::min_cardinality.first.to_i, + fragment2.sc::min_cardinality.first.to_i].min.to_s
102
+ new_fragment.sc::max_cardinality = [fragment1.sc::max_cardinality.first.to_i, + fragment2.sc::max_cardinality.first.to_i].max.to_s
103
+ end
63
104
 
64
105
  # sc:selector
65
- selector = generalize_selectors(fragment1.sc::selector + fragment2.sc::selector)
106
+ selector = generalize(fragment1.sc::selector + fragment2.sc::selector)
66
107
  new_fragment.graph << selector
67
108
  new_fragment.sc::selector = selector
68
109
 
69
110
  # sc:identifier
70
111
  if fragment1.sc::identifier.first
71
- selector = generalize_selectors(fragment1.sc::identifier + fragment2.sc::identifier)
112
+ selector = generalize(fragment1.sc::identifier + fragment2.sc::identifier)
72
113
  new_fragment.graph << selector
73
114
  new_fragment.sc::identifier = selector
74
115
  end
75
-
76
- # sc:subfragment
77
- all_subfragments = fragment1.sc::subfragment + fragment2.sc::subfragment
78
- all_subfragments.map { |sf| [sf.sc::type.sort_by(&:to_s), sf.sc::relation.sort_by(&:to_s)] }.uniq.each do |types, relations|
79
- subfragments = all_subfragments.select do |sf|
80
- sf.sc::type.sort_by(&:to_s) == types and
81
- sf.sc::relation.sort_by(&:to_s) == relations
82
- end
83
- end
84
-
85
- # Check new output
86
- separate_output1 = fragment1.extract_graph :doc=>doc
87
- separate_output2 = fragment2.extract_graph :doc=>doc
88
- separate_output = separate_output1.merge separate_output2
89
- new_output = new_fragment.proxy.extract_graph :doc=>doc
90
-
91
- # Check if the output with the new fragment is a subset of the full output
92
- # and if the output of the fragments alone is a subset of the output of the new
93
- # fragment. This way we ensure the output is the same without using all the
94
- # fragments that are available in the knowledge base.
95
- new_fragment.proxy # if output.contains?(new_output) and new_output.contains?(separate_output)
116
+
117
+ # All new nodes are expected to be inconsistent after performing
118
+ # subfragments' extractions. Otherwise, if new nodes are consistent, it means
119
+ # the output from the mixed fragment is different from the separate fragments
120
+ # and therefore the generalization has failed, so no mixed fragment is returned
121
+ new_fragment
96
122
  end
97
-
98
- def generalize_selectors selectors
123
+
124
+ # Generalize a set of selectors
125
+ def generalize selectors
99
126
  selector = Node(nil)
100
127
  selector.rdf::type = Node('sc:VisualSelector')
101
128
  selector.sc::min_relative_x = selectors.map { |s| s.sc::min_relative_x.map(&:to_i) }.flatten.min.to_s
@@ -58,8 +58,9 @@ module Scrappy
58
58
  # Patterns
59
59
 
60
60
  app.get '/patterns' do
61
- @uris = Scrappy::Kb.patterns.find(nil, Node('rdf:type'), Node('sc:Fragment')).
62
- map { |node| node.sc::type }.flatten.map(&:to_s).sort
61
+ @uris = ( Scrappy::Kb.patterns.find(nil, Node('rdf:type'), Node('sc:Fragment')) -
62
+ Scrappy::Kb.patterns.find([], Node('sc:subfragment'), nil) ).
63
+ map { |node| node.sc::type }.flatten.map(&:to_s).sort
63
64
  haml :patterns
64
65
  end
65
66
 
@@ -95,7 +96,7 @@ module Scrappy
95
96
  end
96
97
 
97
98
  app.post '/samples/:id/optimize' do |id|
98
- Scrappy::Kb.patterns = agent.optimize(Scrappy::Kb.patterns, Scrappy::App.samples[id.to_i])
99
+ Scrappy::Kb.patterns = agent.optimize_patterns(Scrappy::Kb.patterns, Scrappy::App.samples[id.to_i])
99
100
  Scrappy::App.save_patterns Scrappy::Kb.patterns
100
101
  flash[:notice] = "Optimization completed"
101
102
  redirect "#{settings.base_uri}/samples"
data/scrappy.gemspec CHANGED
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scrappy}
5
- s.version = "0.3.4"
5
+ s.version = "0.3.5"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
9
- s.date = %q{2011-03-25}
9
+ s.date = %q{2011-03-29}
10
10
  s.default_executable = %q{scrappy}
11
11
  s.description = %q{RDF web scraper}
12
12
  s.email = %q{joseignacio.fernandez@gmail.com}
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 3
8
- - 4
9
- version: 0.3.4
8
+ - 5
9
+ version: 0.3.5
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-03-25 00:00:00 +01:00
17
+ date: 2011-03-29 00:00:00 +02:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency