scrappy 0.3.4 → 0.3.5
- data/History.txt +4 -0
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/agent/agent.rb +11 -6
- data/lib/scrappy/agent/map_reduce.rb +4 -2
- data/lib/scrappy/extractor/fragment.rb +21 -19
- data/lib/scrappy/extractor/selectors/new_uri.rb +3 -1
- data/lib/scrappy/learning/optimizer.rb +80 -53
- data/lib/scrappy/server/admin.rb +4 -3
- data/scrappy.gemspec +2 -2
- metadata +3 -3
data/History.txt
CHANGED
data/lib/scrappy.rb
CHANGED
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -71,17 +71,22 @@ module Scrappy
         else
           []
         end
+
+        # Recently created URIs are not followed
+        nofollow = triples.select { |s,p,o| p==ID("rdf:type") and o==ID("sc:NewUri") }.map{|s,p,o| s}.select{|n| n.is_a?(Symbol)}
+        pages -= nofollow
+        uris -= nofollow
 
         items = ( pages.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth].max} } +
                   uris.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth-1].max} } ).
-                uniq.select{ |item| !RDF::ID.bnode?(item[:uri]) }
+                uniq.select { |item| !RDF::ID.bnode?(item[:uri]) }
 
-        items.each { |item| puts "Enqueuing (depth = #{item[:depth]}): #{item[:uri]}" } if options.debug
+        items.each { |item| puts "Enqueuing (depth = #{item[:depth]}): #{item[:uri]}" if !queue or !(queue.history + queue.items).include?(item) } if options.debug
 
-        if queue
-          triples += process items
-        else
+        if queue
           items.each { |item| queue.push_unless_done item }
+        else
+          triples += process items
         end
 
         triples unless options.dump
@@ -153,7 +158,7 @@ module Scrappy
       end
 
       def clean triples
-        triples.uniq.select { |s,p,o| p!=ID('rdf:type') or ![ID('sc:Index'), ID('sc:Page')].include?(o) }
+        triples.uniq.select { |s,p,o| p!=ID('rdf:type') or ![ID('sc:Index'), ID('sc:Page'), ID('sc:NewUri')].include?(o) }
       end
 
       # Do the extraction using RDF repository
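The agent change keeps freshly built URIs out of the crawl frontier: any subject typed sc:NewUri is removed from both pages and uris before items are enqueued, and clean now also drops the sc:NewUri marker triples from the output. A minimal plain-Ruby sketch of that filtering step, with hypothetical data and plain strings instead of scrappy's ID() helper:

    # Hypothetical triples: :item1 was minted by a selector and marked sc:NewUri
    triples = [[:item1, "rdf:type", "sc:NewUri"],
               [:item2, "rdf:type", "sc:Page"]]
    pages   = [:item1, :item2]

    nofollow = triples.select { |s, p, o| p == "rdf:type" and o == "sc:NewUri" }
                      .map    { |s, p, o| s }
                      .select { |n| n.is_a?(Symbol) }

    pages -= nofollow
    p pages   # => [:item2] -- :item1 is never enqueued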
data/lib/scrappy/agent/map_reduce.rb
CHANGED
@@ -6,6 +6,8 @@ module MapReduce
   class Queue
     include MonitorMixin
 
+    attr_reader :history, :items
+
     def initialize
       super
       @items = []
@@ -17,7 +19,7 @@ module MapReduce
       item = nil
       synchronize do
         item = @items.shift
-        @history << item
+        @history << item if item
         if @items.empty?
           yield item if (block_given? and item)
           yielded = true
@@ -36,7 +38,7 @@ module MapReduce
     end
 
     def push_unless_done value
-      synchronize { @items << value }
+      synchronize { @items << value if !@history.include?(value) and !@items.include?(value) }
     end
 
     def empty?
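The queue now exposes history and items (read by the new enqueue-logging check in agent.rb), records only non-nil items in its history, and push_unless_done skips values that were already processed or are already waiting. A standalone sketch of that deduplication behaviour, same idea but not the gem's MapReduce::Queue class:

    require 'monitor'

    class DedupQueue
      include MonitorMixin

      attr_reader :history, :items

      def initialize
        super
        @items   = []
        @history = []
      end

      def push_unless_done value
        synchronize { @items << value if !@history.include?(value) and !@items.include?(value) }
      end

      def pop
        synchronize do
          item = @items.shift
          @history << item if item   # nil (empty queue) is no longer recorded
          item
        end
      end
    end

    q = DedupQueue.new
    q.push_unless_done :a
    q.push_unless_done :a    # ignored: already queued
    q.pop                    # => :a
    q.push_unless_done :a    # ignored: already in history
    p q.items                # => []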
data/lib/scrappy/extractor/fragment.rb
CHANGED
@@ -2,20 +2,19 @@ module Sc
   class Fragment
     include RDF::NodeProxy
 
-    # Extracts data out of a document and returns an RDF::Graph
-    def extract_graph options={}
-      graph = RDF::Graph.new
-      extract(options).each { |node| graph << node }
-      graph
-    end
-
     # Extracts data out of a document and returns an array of nodes
     def extract options={}
+      all_mappings(options).map { |mapping| mapping[:node] }
+    end
+
+    # Returns all mappings of a fragment by
+    # recursively processing all submappings.
+    def all_mappings options={}
       # Extracts all the mappings and any subfragment
-      mappings(options).map do |
-        node =
-        subfragments =
-        doc =
+      mappings(options).map do |mapping|
+        node         = mapping[:node]
+        subfragments = mapping[:subfragments]
+        doc          = mapping[:doc]
 
         # Process subfragments
         consistent = true
@@ -23,18 +22,19 @@ module Sc
           # Get subfragment object
           subfragment = subfragment.proxy Node('sc:Fragment')
 
-          #
-
+          # Add triples from submappings
+          submappings = subfragment.all_mappings(options.merge(:doc=>doc))
 
           # Add relations
-
+          submappings.each do |submapping|
+            subnode = submapping[:node]
             node.graph << subnode if subnode.is_a?(RDF::Node)
             subfragment.sc::relation.each { |relation| node[relation] += [subnode] }
           end
 
           # Check consistency
-          consistent = false if subfragment.sc::min_cardinality.first and
-          consistent = false if subfragment.sc::max_cardinality.first and
+          consistent = false if subfragment.sc::min_cardinality.first and submappings.size < subfragment.sc::min_cardinality.first.to_i
+          consistent = false if subfragment.sc::max_cardinality.first and submappings.size > subfragment.sc::max_cardinality.first.to_i
         end
 
         # Skip the node if it has inconsistent relations
@@ -42,11 +42,12 @@ module Sc
         # violate the constraint sc:min_cardinality = 1
         next if !consistent
 
-        node
+        { :node=>node, :subfragments=>subfragments, :doc=>doc }
       end.compact
     end
 
-    # Returns
+    # Returns the mappings between this fragment
+    # and the RDF nodes it matches
     def mappings options
       # Identify the fragment's mappings
       docs = sc::selector.map { |s| graph.node(s).select options[:doc] }.flatten
@@ -66,7 +67,7 @@ module Sc
         value = doc[:value].to_s.strip
         if options[:referenceable]
           node.rdf::value = value
-          node.rdf::type
+          node.rdf::type += [Node('rdf:Literal')]
           node
         else
           value
@@ -117,6 +118,7 @@ module Sc
           node.graph << uri_node
           node.sc::uri = uri_node
         end
+        node.rdf::type += [Node("sc:NewUri")] if d[:nofollow]
 
         node
       end.first
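extract_graph is gone and extract is now a thin wrapper over the new all_mappings, which returns hashes carrying :node, :subfragments and :doc so callers such as the optimizer can reuse the mapping context. A toy sketch of that wrapper shape, with a stubbed all_mappings and hypothetical data:

    # Illustrative only: the real all_mappings walks selectors and subfragments.
    def all_mappings(options = {})
      # each mapping keeps the extracted node plus the context it came from
      [{ :node => "node-1", :subfragments => [], :doc => { :uri => options[:uri] } }]
    end

    def extract(options = {})
      all_mappings(options).map { |mapping| mapping[:node] }
    end

    p extract(:uri => "http://example.org")   # => ["node-1"]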
data/lib/scrappy/extractor/selectors/new_uri.rb
CHANGED
@@ -13,6 +13,8 @@ module Sc
       prefix = (prefix =~ /\Ahttp\:/ or prefix =~ /\Ahttps\:/) ? URI::parse(doc[:uri]).merge(prefix).to_s : "#{doc[:uri]}#{prefix}"
       suffix = sc::suffix.first.to_s
 
+      nofollow = (sc::follow.first != "false")
+
       contents.map do |content, attribute|
         new_uri = if (content.to_s =~ /\Ahttp\:/ or content.to_s =~ /\Ahttps\:/)
           "#{content}#{suffix}"
@@ -29,7 +31,7 @@ module Sc
           "#{prefix}#{variable}#{suffix}"
         end
 
-        { :uri=>new_uri, :content=>doc[:content], :value=>new_uri, :attribute=>attribute }
+        { :uri=>new_uri, :content=>doc[:content], :value=>new_uri, :attribute=>attribute, :nofollow=>nofollow }
       end
     end
   end
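The selector computes a nofollow flag from its sc:follow property and attaches it to every result hash; fragment.rb then types the node sc:NewUri when the flag is set, which is what agent.rb filters on. A plain-Ruby sketch of the flag computation and propagation, with illustrative values rather than the gem's Node API:

    follow   = nil                      # value of the selector's sc:follow property, if any
    nofollow = (follow != "false")      # same test as in new_uri.rb

    result = { :uri => "http://example.org/item/1", :value => "http://example.org/item/1",
               :attribute => nil, :nofollow => nofollow }
    p result[:nofollow]                 # => true, so the resulting node gets typed sc:NewUri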
data/lib/scrappy/learning/optimizer.rb
CHANGED
@@ -2,10 +2,7 @@ module Scrappy
   module Optimizer
     # Iterates through a knowledge base and tries to merge and generalize
    # selectors whenever the output of the resulting kb is the same
-    def
-      # Get the output only once
-      output = RDF::Graph.new extract(sample[:uri], sample[:html], kb)
-
+    def optimize_patterns kb, sample
      # Build an array of fragments
      root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
      fragments = []; root_fragments.each { |f| fragments << kb.node(Node(f.id, RDF::Graph.new(f.all_triples))) }
@@ -13,89 +10,119 @@ module Scrappy
      # Parse the document
      doc = { :uri=>sample[:uri], :content=>Nokogiri::HTML(sample[:html], nil, 'utf-8') }
 
-
-
-      end until !changed
+      # Optimize the fragment
+      fragments = optimize fragments, :docs=>[doc]
 
      graph = RDF::Graph.new
      fragments.each { |fragment| graph << fragment }
-
-      puts graph.serialize(:yarf)
-      exit
 
      graph
    end
 
    protected
-    # Tries to optimize a set of fragments
-
-
-
+    # Tries to optimize a set of fragments
+    def optimize fragments, options
+      # Tries to iterate until no changes are made
+      @tried = []
+      new_fragments = fragments.map{ |f| f.proxy(Node('sc:Fragment')) }
+      begin
+        fragments = new_fragments
+        new_fragments = optimize_once fragments, options
+      end until fragments == new_fragments
+      fragments
+    end
+
+    # Tries to perform one optimization of two fragments out of a set of fragments
+    def optimize_once fragments, options
+      docs = options[:docs]
      fragments.each do |fragment1|
        fragments.each do |fragment2|
          next if fragment1 == fragment2
-
+          # Won't get gain if the fragment does not produce the same kind of RDF resource
+          next if !fragment1.sc::type.equivalent?(fragment2.sc::type) or
+                  !fragment1.sc::relation.equivalent?(fragment2.sc::relation) or
+                  !fragment1.sc::superclass.equivalent?(fragment2.sc::superclass) or
+                  !fragment1.sc::sameas.equivalent?(fragment2.sc::sameas) or
+                  fragment1.sc::identifier.size != fragment2.sc::identifier.size
+
+          next if @tried.include?([fragment1, fragment2])
+          next if @tried.include?([fragment2, fragment1])
+
+          @tried << [fragment1, fragment2]
+
+          # Get mappings without mixing fragments
+          old_mappings = []
+          docs.each do |doc|
+            old_mappings += fragment1.all_mappings(:doc=>doc)
+            old_mappings += fragment2.all_mappings(:doc=>doc)
+          end
+          old_docs = old_mappings.map { |mapping| mapping[:doc] }
+
+          # Get mixed fragment
+          new_fragment = mix(fragment1, fragment2, options)
 
-          #
-
-
+          # Get new mappings
+          new_mappings = []
+          docs.each { |doc| new_mappings += new_fragment.mappings(:doc=>doc) }
+          new_docs = new_mappings.map { |mapping| mapping[:doc] }
+
+          # Optimize subfragments
+          subfragments = optimize(fragment1.sc::subfragment + fragment2.sc::subfragment, options.merge(:docs=>new_docs))
+          subfragments.each { |subfragment| new_fragment.graph << subfragment }
+          new_fragment.sc::subfragment = subfragments.map &:node
+
+          # End if the new fragment returns the same results
+          if true
+            return fragments - [fragment1] - [fragment2] + [new_fragment]
          end
        end
      end
-
+      fragments
    end
 
-    def
-
-      return if !fragment1.sc::type.equivalent?(fragment2.sc::type) or
-                !fragment1.sc::relation.equivalent?(fragment2.sc::relation) or
-                !fragment1.sc::superclass.equivalent?(fragment2.sc::superclass) or
-                !fragment1.sc::sameas.equivalent?(fragment2.sc::sameas) or
-                fragment1.sc::identifier.size != fragment2.sc::identifier.size
+    def mix fragment1, fragment2, options
+      docs = options[:docs]
 
      # Build new fragment
-      new_fragment = Node(nil)
-      new_fragment.rdf::type =
+      new_fragment = Node(nil).proxy(Node('sc:Fragment'))
+      new_fragment.rdf::type = Node('sc:Fragment')
      new_fragment.sc::type = fragment1.sc::type
      new_fragment.sc::relation = fragment1.sc::relation
      new_fragment.sc::superclass = fragment1.sc::superclass
      new_fragment.sc::sameas = fragment1.sc::sameas
+
+      # If fragments share the same parent, cardinality has to increase
+      # Otherwise, they might map the same subdocument, so cardinality
+      # limits are made more general.
+      if fragment1.sc::superfragment.first and fragment1.sc::superfragment == fragment2.sc::superfragment
+        new_fragment.sc::min_cardinality = (fragment1.sc::min_cardinality.first.to_i + fragment2.sc::min_cardinality.first.to_i).to_s
+        new_fragment.sc::max_cardinality = (fragment1.sc::max_cardinality.first.to_i + fragment2.sc::max_cardinality.first.to_i).to_s
+      else
+        new_fragment.sc::min_cardinality = [fragment1.sc::min_cardinality.first.to_i, + fragment2.sc::min_cardinality.first.to_i].min.to_s
+        new_fragment.sc::max_cardinality = [fragment1.sc::max_cardinality.first.to_i, + fragment2.sc::max_cardinality.first.to_i].max.to_s
+      end
 
      # sc:selector
-      selector =
+      selector = generalize(fragment1.sc::selector + fragment2.sc::selector)
      new_fragment.graph << selector
      new_fragment.sc::selector = selector
 
      # sc:identifier
      if fragment1.sc::identifier.first
-        selector =
+        selector = generalize(fragment1.sc::identifier + fragment2.sc::identifier)
        new_fragment.graph << selector
        new_fragment.sc::identifier = selector
      end
-
-      #
-
-
-
-
-        sf.sc::relation.sort_by(&:to_s) == relations
-      end
-    end
-
-      # Check new output
-      separate_output1 = fragment1.extract_graph :doc=>doc
-      separate_output2 = fragment2.extract_graph :doc=>doc
-      separate_output = separate_output1.merge separate_output2
-      new_output = new_fragment.proxy.extract_graph :doc=>doc
-
-      # Check if the output with the new fragment is a subset of the full output
-      # and if the output of the fragments alone is a subset of the output of the new
-      # fragment. This way we ensure the output is the same without using all the
-      # fragments that are available in the knowledge base.
-      new_fragment.proxy # if output.contains?(new_output) and new_output.contains?(separate_output)
+
+      # All new nodes are expected to be inconsistent after performing
+      # subfragments' extractions. Otherwise, if new nodes are consistent, it means
+      # the output from the mixed fragment is different from the separate fragments
+      # and therefore the generalization has failed, so no mixed fragment is returned
+      new_fragment
    end
-
-
+
+    # Generalize a set of selectors
+    def generalize selectors
      selector = Node(nil)
      selector.rdf::type = Node('sc:VisualSelector')
      selector.sc::min_relative_x = selectors.map { |s| s.sc::min_relative_x.map(&:to_i) }.flatten.min.to_s
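The optimizer is now split into optimize (a fixed-point loop), optimize_once (merges one pair of compatible fragments per pass), mix (builds the merged fragment, combining cardinalities) and generalize (widens the selectors). A generic sketch of the fixed-point loop, with a stubbed one-step reduction standing in for the real pairwise merge:

    def optimize_once(fragments)
      # stand-in for the real merge step: drop one duplicate if present
      fragments.uniq.size < fragments.size ? fragments.uniq : fragments
    end

    def optimize(fragments)
      new_fragments = fragments
      begin
        fragments     = new_fragments
        new_fragments = optimize_once(fragments)
      end until fragments == new_fragments
      fragments
    end

    p optimize([:a, :a, :b])   # => [:a, :b]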
data/lib/scrappy/server/admin.rb
CHANGED
@@ -58,8 +58,9 @@ module Scrappy
       # Patterns
 
       app.get '/patterns' do
-        @uris = Scrappy::Kb.patterns.find(nil, Node('rdf:type'), Node('sc:Fragment'))
-
+        @uris = ( Scrappy::Kb.patterns.find(nil, Node('rdf:type'), Node('sc:Fragment')) -
+                  Scrappy::Kb.patterns.find([], Node('sc:subfragment'), nil) ).
+                map { |node| node.sc::type }.flatten.map(&:to_s).sort
         haml :patterns
       end
 
@@ -95,7 +96,7 @@ module Scrappy
       end
 
       app.post '/samples/:id/optimize' do |id|
-        Scrappy::Kb.patterns = agent.
+        Scrappy::Kb.patterns = agent.optimize_patterns(Scrappy::Kb.patterns, Scrappy::App.samples[id.to_i])
         Scrappy::App.save_patterns Scrappy::Kb.patterns
         flash[:notice] = "Optimization completed"
         redirect "#{settings.base_uri}/samples"
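The /patterns view now lists only root fragments, that is, fragments that are not the object of any sc:subfragment triple, and shows their sc:type values; /samples/:id/optimize delegates to the new optimize_patterns. The root-fragment filter is a plain set difference, sketched here with hypothetical fragment ids:

    all_fragments = [:f1, :f2, :f3]
    subfragments  = [:f3]                 # objects of sc:subfragment triples

    root_fragments = all_fragments - subfragments
    p root_fragments                      # => [:f1, :f2]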
data/scrappy.gemspec
CHANGED
@@ -2,11 +2,11 @@
 
 Gem::Specification.new do |s|
   s.name = %q{scrappy}
-  s.version = "0.3.4"
+  s.version = "0.3.5"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
   s.authors = ["Jose Ignacio"]
-  s.date = %q{2011-03-
+  s.date = %q{2011-03-29}
   s.default_executable = %q{scrappy}
   s.description = %q{RDF web scraper}
   s.email = %q{joseignacio.fernandez@gmail.com}
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
 segments:
 - 0
 - 3
-- 4
-version: 0.3.4
+- 5
+version: 0.3.5
 platform: ruby
 authors:
 - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2011-03-
+date: 2011-03-29 00:00:00 +02:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
|