scrappy 0.3.4 → 0.3.5

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,7 @@
1
+ === 0.3.5 2011-03-29
2
+
3
+ * Added nofollow support to NewUriSelectors
4
+
1
5
  === 0.3.4 2011-03-25
2
6
 
3
7
  * Fixed gem dependency
data/lib/scrappy.rb CHANGED
@@ -24,5 +24,5 @@ require 'scrappy/agent/blind_agent'
24
24
  require 'scrappy/agent/agent'
25
25
 
26
26
  module Scrappy
27
- VERSION = '0.3.4'
27
+ VERSION = '0.3.5'
28
28
  end
@@ -71,17 +71,22 @@ module Scrappy
71
71
  else
72
72
  []
73
73
  end
74
+
75
+ # Recently created URIs are not followed
76
+ nofollow = triples.select { |s,p,o| p==ID("rdf:type") and o==ID("sc:NewUri") }.map{|s,p,o| s}.select{|n| n.is_a?(Symbol)}
77
+ pages -= nofollow
78
+ uris -= nofollow
74
79
 
75
80
  items = ( pages.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth].max} } +
76
81
  uris.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth-1].max} } ).
77
- uniq.select{ |item| !RDF::ID.bnode?(item[:uri]) }
82
+ uniq.select { |item| !RDF::ID.bnode?(item[:uri]) }
78
83
 
79
- items.each { |item| puts "Enqueuing (depth = #{item[:depth]}): #{item[:uri]}" } if options.debug
84
+ items.each { |item| puts "Enqueuing (depth = #{item[:depth]}): #{item[:uri]}" if !queue or !(queue.history + queue.items).include?(item) } if options.debug
80
85
 
81
- if queue.nil?
82
- triples += process items
83
- else
86
+ if queue
84
87
  items.each { |item| queue.push_unless_done item }
88
+ else
89
+ triples += process items
85
90
  end
86
91
 
87
92
  triples unless options.dump
@@ -153,7 +158,7 @@ module Scrappy
153
158
  end
154
159
 
155
160
  def clean triples
156
- triples.uniq.select { |s,p,o| p!=ID('rdf:type') or ![ID('sc:Index'), ID('sc:Page')].include?(o) }
161
+ triples.uniq.select { |s,p,o| p!=ID('rdf:type') or ![ID('sc:Index'), ID('sc:Page'), ID('sc:NewUri')].include?(o) }
157
162
  end
158
163
 
159
164
  # Do the extraction using RDF repository
@@ -6,6 +6,8 @@ module MapReduce
6
6
  class Queue
7
7
  include MonitorMixin
8
8
 
9
+ attr_reader :history, :items
10
+
9
11
  def initialize
10
12
  super
11
13
  @items = []
@@ -17,7 +19,7 @@ module MapReduce
17
19
  item = nil
18
20
  synchronize do
19
21
  item = @items.shift
20
- @history << item
22
+ @history << item if item
21
23
  if @items.empty?
22
24
  yield item if (block_given? and item)
23
25
  yielded = true
@@ -36,7 +38,7 @@ module MapReduce
36
38
  end
37
39
 
38
40
  def push_unless_done value
39
- synchronize { @items << value unless @history.include?(value) }
41
+ synchronize { @items << value if !@history.include?(value) and !@items.include?(value) }
40
42
  end
41
43
 
42
44
  def empty?
@@ -2,20 +2,19 @@ module Sc
2
2
  class Fragment
3
3
  include RDF::NodeProxy
4
4
 
5
- # Extracts data out of a document and returns an RDF::Graph
6
- def extract_graph options={}
7
- graph = RDF::Graph.new
8
- extract(options).each { |node| graph << node }
9
- graph
10
- end
11
-
12
5
  # Extracts data out of a document and returns an array of nodes
13
6
  def extract options={}
7
+ all_mappings(options).map { |mapping| mapping[:node] }
8
+ end
9
+
10
+ # Returns all mappings of a fragment by
11
+ # recursively processing all submappings.
12
+ def all_mappings options={}
14
13
  # Extracts all the mappings and any subfragment
15
- mappings(options).map do |result|
16
- node = result[:node]
17
- subfragments = result[:subfragments]
18
- doc = result[:doc]
14
+ mappings(options).map do |mapping|
15
+ node = mapping[:node]
16
+ subfragments = mapping[:subfragments]
17
+ doc = mapping[:doc]
19
18
 
20
19
  # Process subfragments
21
20
  consistent = true
@@ -23,18 +22,19 @@ module Sc
23
22
  # Get subfragment object
24
23
  subfragment = subfragment.proxy Node('sc:Fragment')
25
24
 
26
- # Extract data from the subfragment
27
- subnodes = subfragment.extract(options.merge(:doc=>doc))
25
+ # Add triples from submappings
26
+ submappings = subfragment.all_mappings(options.merge(:doc=>doc))
28
27
 
29
28
  # Add relations
30
- subnodes.each do |subnode|
29
+ submappings.each do |submapping|
30
+ subnode = submapping[:node]
31
31
  node.graph << subnode if subnode.is_a?(RDF::Node)
32
32
  subfragment.sc::relation.each { |relation| node[relation] += [subnode] }
33
33
  end
34
34
 
35
35
  # Check consistency
36
- consistent = false if subfragment.sc::min_cardinality.first and subnodes.size < subfragment.sc::min_cardinality.first.to_i
37
- consistent = false if subfragment.sc::max_cardinality.first and subnodes.size > subfragment.sc::max_cardinality.first.to_i
36
+ consistent = false if subfragment.sc::min_cardinality.first and submappings.size < subfragment.sc::min_cardinality.first.to_i
37
+ consistent = false if subfragment.sc::max_cardinality.first and submappings.size > subfragment.sc::max_cardinality.first.to_i
38
38
  end
39
39
 
40
40
  # Skip the node if it has inconsistent relations
@@ -42,11 +42,12 @@ module Sc
42
42
  # violate the constraint sc:min_cardinality = 1
43
43
  next if !consistent
44
44
 
45
- node
45
+ { :node=>node, :subfragments=>subfragments, :doc=>doc }
46
46
  end.compact
47
47
  end
48
48
 
49
- # Returns all the mappings between this fragment and RDF nodes
49
+ # Returns the mappings between this fragment
50
+ # and the RDF nodes it matches
50
51
  def mappings options
51
52
  # Identify the fragment's mappings
52
53
  docs = sc::selector.map { |s| graph.node(s).select options[:doc] }.flatten
@@ -66,7 +67,7 @@ module Sc
66
67
  value = doc[:value].to_s.strip
67
68
  if options[:referenceable]
68
69
  node.rdf::value = value
69
- node.rdf::type = Node('rdf:Literal')
70
+ node.rdf::type += [Node('rdf:Literal')]
70
71
  node
71
72
  else
72
73
  value
@@ -117,6 +118,7 @@ module Sc
117
118
  node.graph << uri_node
118
119
  node.sc::uri = uri_node
119
120
  end
121
+ node.rdf::type += [Node("sc:NewUri")] if d[:nofollow]
120
122
 
121
123
  node
122
124
  end.first
@@ -13,6 +13,8 @@ module Sc
13
13
  prefix = (prefix =~ /\Ahttp\:/ or prefix =~ /\Ahttps\:/) ? URI::parse(doc[:uri]).merge(prefix).to_s : "#{doc[:uri]}#{prefix}"
14
14
  suffix = sc::suffix.first.to_s
15
15
 
16
+ nofollow = (sc::follow.first != "false")
17
+
16
18
  contents.map do |content, attribute|
17
19
  new_uri = if (content.to_s =~ /\Ahttp\:/ or content.to_s =~ /\Ahttps\:/)
18
20
  "#{content}#{suffix}"
@@ -29,7 +31,7 @@ module Sc
29
31
  "#{prefix}#{variable}#{suffix}"
30
32
  end
31
33
 
32
- { :uri=>new_uri, :content=>doc[:content], :value=>new_uri, :attribute=>attribute }
34
+ { :uri=>new_uri, :content=>doc[:content], :value=>new_uri, :attribute=>attribute, :nofollow=>nofollow }
33
35
  end
34
36
  end
35
37
  end
@@ -2,10 +2,7 @@ module Scrappy
2
2
  module Optimizer
3
3
  # Iterates through a knowledge base and tries to merge and generalize
4
4
  # selectors whenever the output of the resulting kb is the same
5
- def optimize kb, sample
6
- # Get the output only once
7
- output = RDF::Graph.new extract(sample[:uri], sample[:html], kb)
8
-
5
+ def optimize_patterns kb, sample
9
6
  # Build an array of fragments
10
7
  root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
11
8
  fragments = []; root_fragments.each { |f| fragments << kb.node(Node(f.id, RDF::Graph.new(f.all_triples))) }
@@ -13,89 +10,119 @@ module Scrappy
13
10
  # Parse the document
14
11
  doc = { :uri=>sample[:uri], :content=>Nokogiri::HTML(sample[:html], nil, 'utf-8') }
15
12
 
16
- begin
17
- changed, fragments = optimize_once fragments, doc, output
18
- end until !changed
13
+ # Optimize the fragment
14
+ fragments = optimize fragments, :docs=>[doc]
19
15
 
20
16
  graph = RDF::Graph.new
21
17
  fragments.each { |fragment| graph << fragment }
22
-
23
- puts graph.serialize(:yarf)
24
- exit
25
18
 
26
19
  graph
27
20
  end
28
21
 
29
22
  protected
30
- # Tries to optimize a set of fragments.
31
- # Returns true if there were changes, false otherwise,
32
- # and the new fragments as the second array element
33
- def optimize_once fragments, doc, output
23
+ # Tries to optimize a set of fragments
24
+ def optimize fragments, options
25
+ # Tries to iterate until no changes are made
26
+ @tried = []
27
+ new_fragments = fragments.map{ |f| f.proxy(Node('sc:Fragment')) }
28
+ begin
29
+ fragments = new_fragments
30
+ new_fragments = optimize_once fragments, options
31
+ end until fragments == new_fragments
32
+ fragments
33
+ end
34
+
35
+ # Tries to perform one optimization of two fragments out of a set of fragments
36
+ def optimize_once fragments, options
37
+ docs = options[:docs]
34
38
  fragments.each do |fragment1|
35
39
  fragments.each do |fragment2|
36
40
  next if fragment1 == fragment2
37
- new_fragment = mix_if_gain(fragment1, fragment2, doc, output)
41
+ # Won't get gain if the fragment does not produce the same kind of RDF resource
42
+ next if !fragment1.sc::type.equivalent?(fragment2.sc::type) or
43
+ !fragment1.sc::relation.equivalent?(fragment2.sc::relation) or
44
+ !fragment1.sc::superclass.equivalent?(fragment2.sc::superclass) or
45
+ !fragment1.sc::sameas.equivalent?(fragment2.sc::sameas) or
46
+ fragment1.sc::identifier.size != fragment2.sc::identifier.size
47
+
48
+ next if @tried.include?([fragment1, fragment2])
49
+ next if @tried.include?([fragment2, fragment1])
50
+
51
+ @tried << [fragment1, fragment2]
52
+
53
+ # Get mappings without mixing fragments
54
+ old_mappings = []
55
+ docs.each do |doc|
56
+ old_mappings += fragment1.all_mappings(:doc=>doc)
57
+ old_mappings += fragment2.all_mappings(:doc=>doc)
58
+ end
59
+ old_docs = old_mappings.map { |mapping| mapping[:doc] }
60
+
61
+ # Get mixed fragment
62
+ new_fragment = mix(fragment1, fragment2, options)
38
63
 
39
- # End if a new fragment was created
40
- if new_fragment
41
- return [true, fragments - [fragment1] - [fragment2] + [new_fragment]]
64
+ # Get new mappings
65
+ new_mappings = []
66
+ docs.each { |doc| new_mappings += new_fragment.mappings(:doc=>doc) }
67
+ new_docs = new_mappings.map { |mapping| mapping[:doc] }
68
+
69
+ # Optimize subfragments
70
+ subfragments = optimize(fragment1.sc::subfragment + fragment2.sc::subfragment, options.merge(:docs=>new_docs))
71
+ subfragments.each { |subfragment| new_fragment.graph << subfragment }
72
+ new_fragment.sc::subfragment = subfragments.map &:node
73
+
74
+ # End if the new fragment returns the same results
75
+ if true
76
+ return fragments - [fragment1] - [fragment2] + [new_fragment]
42
77
  end
43
78
  end
44
79
  end
45
- [false, fragments]
80
+ fragments
46
81
  end
47
82
 
48
- def mix_if_gain fragment1, fragment2, doc, output
49
- # Won't get gain if the fragment does not produce the same kind of RDF resource
50
- return if !fragment1.sc::type.equivalent?(fragment2.sc::type) or
51
- !fragment1.sc::relation.equivalent?(fragment2.sc::relation) or
52
- !fragment1.sc::superclass.equivalent?(fragment2.sc::superclass) or
53
- !fragment1.sc::sameas.equivalent?(fragment2.sc::sameas) or
54
- fragment1.sc::identifier.size != fragment2.sc::identifier.size
83
+ def mix fragment1, fragment2, options
84
+ docs = options[:docs]
55
85
 
56
86
  # Build new fragment
57
- new_fragment = Node(nil)
58
- new_fragment.rdf::type = fragment1.rdf::type
87
+ new_fragment = Node(nil).proxy(Node('sc:Fragment'))
88
+ new_fragment.rdf::type = Node('sc:Fragment')
59
89
  new_fragment.sc::type = fragment1.sc::type
60
90
  new_fragment.sc::relation = fragment1.sc::relation
61
91
  new_fragment.sc::superclass = fragment1.sc::superclass
62
92
  new_fragment.sc::sameas = fragment1.sc::sameas
93
+
94
+ # If fragments share the same parent, cardinality has to increase
95
+ # Otherwise, they might map the same subdocument, so cardinality
96
+ # limits are made more general.
97
+ if fragment1.sc::superfragment.first and fragment1.sc::superfragment == fragment2.sc::superfragment
98
+ new_fragment.sc::min_cardinality = (fragment1.sc::min_cardinality.first.to_i + fragment2.sc::min_cardinality.first.to_i).to_s
99
+ new_fragment.sc::max_cardinality = (fragment1.sc::max_cardinality.first.to_i + fragment2.sc::max_cardinality.first.to_i).to_s
100
+ else
101
+ new_fragment.sc::min_cardinality = [fragment1.sc::min_cardinality.first.to_i, + fragment2.sc::min_cardinality.first.to_i].min.to_s
102
+ new_fragment.sc::max_cardinality = [fragment1.sc::max_cardinality.first.to_i, + fragment2.sc::max_cardinality.first.to_i].max.to_s
103
+ end
63
104
 
64
105
  # sc:selector
65
- selector = generalize_selectors(fragment1.sc::selector + fragment2.sc::selector)
106
+ selector = generalize(fragment1.sc::selector + fragment2.sc::selector)
66
107
  new_fragment.graph << selector
67
108
  new_fragment.sc::selector = selector
68
109
 
69
110
  # sc:identifier
70
111
  if fragment1.sc::identifier.first
71
- selector = generalize_selectors(fragment1.sc::identifier + fragment2.sc::identifier)
112
+ selector = generalize(fragment1.sc::identifier + fragment2.sc::identifier)
72
113
  new_fragment.graph << selector
73
114
  new_fragment.sc::identifier = selector
74
115
  end
75
-
76
- # sc:subfragment
77
- all_subfragments = fragment1.sc::subfragment + fragment2.sc::subfragment
78
- all_subfragments.map { |sf| [sf.sc::type.sort_by(&:to_s), sf.sc::relation.sort_by(&:to_s)] }.uniq.each do |types, relations|
79
- subfragments = all_subfragments.select do |sf|
80
- sf.sc::type.sort_by(&:to_s) == types and
81
- sf.sc::relation.sort_by(&:to_s) == relations
82
- end
83
- end
84
-
85
- # Check new output
86
- separate_output1 = fragment1.extract_graph :doc=>doc
87
- separate_output2 = fragment2.extract_graph :doc=>doc
88
- separate_output = separate_output1.merge separate_output2
89
- new_output = new_fragment.proxy.extract_graph :doc=>doc
90
-
91
- # Check if the output with the new fragment is a subset of the full output
92
- # and if the output of the fragments alone is a subset of the output of the new
93
- # fragment. This way we ensure the output is the same without using all the
94
- # fragments that are available in the knowledge base.
95
- new_fragment.proxy # if output.contains?(new_output) and new_output.contains?(separate_output)
116
+
117
+ # All new nodes are expected to be inconsistent after performing
118
+ # subfragments' extractions. Otherwise, if new nodes are consistent, it means
119
+ # the output from the mixed fragment is different from the separate fragments
120
+ # and therefore the generalization has failed, so no mixed fragment is returned
121
+ new_fragment
96
122
  end
97
-
98
- def generalize_selectors selectors
123
+
124
+ # Generalize a set of selectors
125
+ def generalize selectors
99
126
  selector = Node(nil)
100
127
  selector.rdf::type = Node('sc:VisualSelector')
101
128
  selector.sc::min_relative_x = selectors.map { |s| s.sc::min_relative_x.map(&:to_i) }.flatten.min.to_s
@@ -58,8 +58,9 @@ module Scrappy
58
58
  # Patterns
59
59
 
60
60
  app.get '/patterns' do
61
- @uris = Scrappy::Kb.patterns.find(nil, Node('rdf:type'), Node('sc:Fragment')).
62
- map { |node| node.sc::type }.flatten.map(&:to_s).sort
61
+ @uris = ( Scrappy::Kb.patterns.find(nil, Node('rdf:type'), Node('sc:Fragment')) -
62
+ Scrappy::Kb.patterns.find([], Node('sc:subfragment'), nil) ).
63
+ map { |node| node.sc::type }.flatten.map(&:to_s).sort
63
64
  haml :patterns
64
65
  end
65
66
 
@@ -95,7 +96,7 @@ module Scrappy
95
96
  end
96
97
 
97
98
  app.post '/samples/:id/optimize' do |id|
98
- Scrappy::Kb.patterns = agent.optimize(Scrappy::Kb.patterns, Scrappy::App.samples[id.to_i])
99
+ Scrappy::Kb.patterns = agent.optimize_patterns(Scrappy::Kb.patterns, Scrappy::App.samples[id.to_i])
99
100
  Scrappy::App.save_patterns Scrappy::Kb.patterns
100
101
  flash[:notice] = "Optimization completed"
101
102
  redirect "#{settings.base_uri}/samples"
data/scrappy.gemspec CHANGED
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scrappy}
5
- s.version = "0.3.4"
5
+ s.version = "0.3.5"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
9
- s.date = %q{2011-03-25}
9
+ s.date = %q{2011-03-29}
10
10
  s.default_executable = %q{scrappy}
11
11
  s.description = %q{RDF web scraper}
12
12
  s.email = %q{joseignacio.fernandez@gmail.com}
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 3
8
- - 4
9
- version: 0.3.4
8
+ - 5
9
+ version: 0.3.5
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-03-25 00:00:00 +01:00
17
+ date: 2011-03-29 00:00:00 +02:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency