scrappy 0.3.4 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/agent/agent.rb +11 -6
- data/lib/scrappy/agent/map_reduce.rb +4 -2
- data/lib/scrappy/extractor/fragment.rb +21 -19
- data/lib/scrappy/extractor/selectors/new_uri.rb +3 -1
- data/lib/scrappy/learning/optimizer.rb +80 -53
- data/lib/scrappy/server/admin.rb +4 -3
- data/scrappy.gemspec +2 -2
- metadata +3 -3
data/History.txt
CHANGED
data/lib/scrappy.rb
CHANGED
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -71,17 +71,22 @@ module Scrappy
|
|
71
71
|
else
|
72
72
|
[]
|
73
73
|
end
|
74
|
+
|
75
|
+
# Recently created URIs are not followed
|
76
|
+
nofollow = triples.select { |s,p,o| p==ID("rdf:type") and o==ID("sc:NewUri") }.map{|s,p,o| s}.select{|n| n.is_a?(Symbol)}
|
77
|
+
pages -= nofollow
|
78
|
+
uris -= nofollow
|
74
79
|
|
75
80
|
items = ( pages.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth].max} } +
|
76
81
|
uris.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth-1].max} } ).
|
77
|
-
uniq.select{ |item| !RDF::ID.bnode?(item[:uri]) }
|
82
|
+
uniq.select { |item| !RDF::ID.bnode?(item[:uri]) }
|
78
83
|
|
79
|
-
items.each { |item| puts "Enqueuing (depth = #{item[:depth]}): #{item[:uri]}" } if options.debug
|
84
|
+
items.each { |item| puts "Enqueuing (depth = #{item[:depth]}): #{item[:uri]}" if !queue or !(queue.history + queue.items).include?(item) } if options.debug
|
80
85
|
|
81
|
-
if queue
|
82
|
-
triples += process items
|
83
|
-
else
|
86
|
+
if queue
|
84
87
|
items.each { |item| queue.push_unless_done item }
|
88
|
+
else
|
89
|
+
triples += process items
|
85
90
|
end
|
86
91
|
|
87
92
|
triples unless options.dump
|
@@ -153,7 +158,7 @@ module Scrappy
|
|
153
158
|
end
|
154
159
|
|
155
160
|
def clean triples
|
156
|
-
triples.uniq.select { |s,p,o| p!=ID('rdf:type') or ![ID('sc:Index'), ID('sc:Page')].include?(o) }
|
161
|
+
triples.uniq.select { |s,p,o| p!=ID('rdf:type') or ![ID('sc:Index'), ID('sc:Page'), ID('sc:NewUri')].include?(o) }
|
157
162
|
end
|
158
163
|
|
159
164
|
# Do the extraction using RDF repository
|
@@ -6,6 +6,8 @@ module MapReduce
|
|
6
6
|
class Queue
|
7
7
|
include MonitorMixin
|
8
8
|
|
9
|
+
attr_reader :history, :items
|
10
|
+
|
9
11
|
def initialize
|
10
12
|
super
|
11
13
|
@items = []
|
@@ -17,7 +19,7 @@ module MapReduce
|
|
17
19
|
item = nil
|
18
20
|
synchronize do
|
19
21
|
item = @items.shift
|
20
|
-
@history << item
|
22
|
+
@history << item if item
|
21
23
|
if @items.empty?
|
22
24
|
yield item if (block_given? and item)
|
23
25
|
yielded = true
|
@@ -36,7 +38,7 @@ module MapReduce
|
|
36
38
|
end
|
37
39
|
|
38
40
|
def push_unless_done value
|
39
|
-
synchronize { @items << value
|
41
|
+
synchronize { @items << value if !@history.include?(value) and !@items.include?(value) }
|
40
42
|
end
|
41
43
|
|
42
44
|
def empty?
|
@@ -2,20 +2,19 @@ module Sc
|
|
2
2
|
class Fragment
|
3
3
|
include RDF::NodeProxy
|
4
4
|
|
5
|
-
# Extracts data out of a document and returns an RDF::Graph
|
6
|
-
def extract_graph options={}
|
7
|
-
graph = RDF::Graph.new
|
8
|
-
extract(options).each { |node| graph << node }
|
9
|
-
graph
|
10
|
-
end
|
11
|
-
|
12
5
|
# Extracts data out of a document and returns an array of nodes
|
13
6
|
def extract options={}
|
7
|
+
all_mappings(options).map { |mapping| mapping[:node] }
|
8
|
+
end
|
9
|
+
|
10
|
+
# Returns all mappings of a fragment by
|
11
|
+
# recursively processing all submappings.
|
12
|
+
def all_mappings options={}
|
14
13
|
# Extracts all the mappings and any subfragment
|
15
|
-
mappings(options).map do |
|
16
|
-
node =
|
17
|
-
subfragments =
|
18
|
-
doc =
|
14
|
+
mappings(options).map do |mapping|
|
15
|
+
node = mapping[:node]
|
16
|
+
subfragments = mapping[:subfragments]
|
17
|
+
doc = mapping[:doc]
|
19
18
|
|
20
19
|
# Process subfragments
|
21
20
|
consistent = true
|
@@ -23,18 +22,19 @@ module Sc
|
|
23
22
|
# Get subfragment object
|
24
23
|
subfragment = subfragment.proxy Node('sc:Fragment')
|
25
24
|
|
26
|
-
#
|
27
|
-
|
25
|
+
# Add triples from submappings
|
26
|
+
submappings = subfragment.all_mappings(options.merge(:doc=>doc))
|
28
27
|
|
29
28
|
# Add relations
|
30
|
-
|
29
|
+
submappings.each do |submapping|
|
30
|
+
subnode = submapping[:node]
|
31
31
|
node.graph << subnode if subnode.is_a?(RDF::Node)
|
32
32
|
subfragment.sc::relation.each { |relation| node[relation] += [subnode] }
|
33
33
|
end
|
34
34
|
|
35
35
|
# Check consistency
|
36
|
-
consistent = false if subfragment.sc::min_cardinality.first and
|
37
|
-
consistent = false if subfragment.sc::max_cardinality.first and
|
36
|
+
consistent = false if subfragment.sc::min_cardinality.first and submappings.size < subfragment.sc::min_cardinality.first.to_i
|
37
|
+
consistent = false if subfragment.sc::max_cardinality.first and submappings.size > subfragment.sc::max_cardinality.first.to_i
|
38
38
|
end
|
39
39
|
|
40
40
|
# Skip the node if it has inconsistent relations
|
@@ -42,11 +42,12 @@ module Sc
|
|
42
42
|
# violate the constraint sc:min_cardinality = 1
|
43
43
|
next if !consistent
|
44
44
|
|
45
|
-
node
|
45
|
+
{ :node=>node, :subfragments=>subfragments, :doc=>doc }
|
46
46
|
end.compact
|
47
47
|
end
|
48
48
|
|
49
|
-
# Returns
|
49
|
+
# Returns the mappings between this fragment
|
50
|
+
# and the RDF nodes it matches
|
50
51
|
def mappings options
|
51
52
|
# Identify the fragment's mappings
|
52
53
|
docs = sc::selector.map { |s| graph.node(s).select options[:doc] }.flatten
|
@@ -66,7 +67,7 @@ module Sc
|
|
66
67
|
value = doc[:value].to_s.strip
|
67
68
|
if options[:referenceable]
|
68
69
|
node.rdf::value = value
|
69
|
-
node.rdf::type
|
70
|
+
node.rdf::type += [Node('rdf:Literal')]
|
70
71
|
node
|
71
72
|
else
|
72
73
|
value
|
@@ -117,6 +118,7 @@ module Sc
|
|
117
118
|
node.graph << uri_node
|
118
119
|
node.sc::uri = uri_node
|
119
120
|
end
|
121
|
+
node.rdf::type += [Node("sc:NewUri")] if d[:nofollow]
|
120
122
|
|
121
123
|
node
|
122
124
|
end.first
|
@@ -13,6 +13,8 @@ module Sc
|
|
13
13
|
prefix = (prefix =~ /\Ahttp\:/ or prefix =~ /\Ahttps\:/) ? URI::parse(doc[:uri]).merge(prefix).to_s : "#{doc[:uri]}#{prefix}"
|
14
14
|
suffix = sc::suffix.first.to_s
|
15
15
|
|
16
|
+
nofollow = (sc::follow.first != "false")
|
17
|
+
|
16
18
|
contents.map do |content, attribute|
|
17
19
|
new_uri = if (content.to_s =~ /\Ahttp\:/ or content.to_s =~ /\Ahttps\:/)
|
18
20
|
"#{content}#{suffix}"
|
@@ -29,7 +31,7 @@ module Sc
|
|
29
31
|
"#{prefix}#{variable}#{suffix}"
|
30
32
|
end
|
31
33
|
|
32
|
-
{ :uri=>new_uri, :content=>doc[:content], :value=>new_uri, :attribute=>attribute }
|
34
|
+
{ :uri=>new_uri, :content=>doc[:content], :value=>new_uri, :attribute=>attribute, :nofollow=>nofollow }
|
33
35
|
end
|
34
36
|
end
|
35
37
|
end
|
@@ -2,10 +2,7 @@ module Scrappy
|
|
2
2
|
module Optimizer
|
3
3
|
# Iterates through a knowledge base and tries to merge and generalize
|
4
4
|
# selectors whenever the output of the resulting kb is the same
|
5
|
-
def
|
6
|
-
# Get the output only once
|
7
|
-
output = RDF::Graph.new extract(sample[:uri], sample[:html], kb)
|
8
|
-
|
5
|
+
def optimize_patterns kb, sample
|
9
6
|
# Build an array of fragments
|
10
7
|
root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
|
11
8
|
fragments = []; root_fragments.each { |f| fragments << kb.node(Node(f.id, RDF::Graph.new(f.all_triples))) }
|
@@ -13,89 +10,119 @@ module Scrappy
|
|
13
10
|
# Parse the document
|
14
11
|
doc = { :uri=>sample[:uri], :content=>Nokogiri::HTML(sample[:html], nil, 'utf-8') }
|
15
12
|
|
16
|
-
|
17
|
-
|
18
|
-
end until !changed
|
13
|
+
# Optimize the fragment
|
14
|
+
fragments = optimize fragments, :docs=>[doc]
|
19
15
|
|
20
16
|
graph = RDF::Graph.new
|
21
17
|
fragments.each { |fragment| graph << fragment }
|
22
|
-
|
23
|
-
puts graph.serialize(:yarf)
|
24
|
-
exit
|
25
18
|
|
26
19
|
graph
|
27
20
|
end
|
28
21
|
|
29
22
|
protected
|
30
|
-
# Tries to optimize a set of fragments
|
31
|
-
|
32
|
-
|
33
|
-
|
23
|
+
# Tries to optimize a set of fragments
|
24
|
+
def optimize fragments, options
|
25
|
+
# Tries to iterate until no changes are made
|
26
|
+
@tried = []
|
27
|
+
new_fragments = fragments.map{ |f| f.proxy(Node('sc:Fragment')) }
|
28
|
+
begin
|
29
|
+
fragments = new_fragments
|
30
|
+
new_fragments = optimize_once fragments, options
|
31
|
+
end until fragments == new_fragments
|
32
|
+
fragments
|
33
|
+
end
|
34
|
+
|
35
|
+
# Tries to perform one optimization of two fragments out of a set of fragments
|
36
|
+
def optimize_once fragments, options
|
37
|
+
docs = options[:docs]
|
34
38
|
fragments.each do |fragment1|
|
35
39
|
fragments.each do |fragment2|
|
36
40
|
next if fragment1 == fragment2
|
37
|
-
|
41
|
+
# Won't get gain if the fragment does not produce the same kind of RDF resource
|
42
|
+
next if !fragment1.sc::type.equivalent?(fragment2.sc::type) or
|
43
|
+
!fragment1.sc::relation.equivalent?(fragment2.sc::relation) or
|
44
|
+
!fragment1.sc::superclass.equivalent?(fragment2.sc::superclass) or
|
45
|
+
!fragment1.sc::sameas.equivalent?(fragment2.sc::sameas) or
|
46
|
+
fragment1.sc::identifier.size != fragment2.sc::identifier.size
|
47
|
+
|
48
|
+
next if @tried.include?([fragment1, fragment2])
|
49
|
+
next if @tried.include?([fragment2, fragment1])
|
50
|
+
|
51
|
+
@tried << [fragment1, fragment2]
|
52
|
+
|
53
|
+
# Get mappings without mixing fragments
|
54
|
+
old_mappings = []
|
55
|
+
docs.each do |doc|
|
56
|
+
old_mappings += fragment1.all_mappings(:doc=>doc)
|
57
|
+
old_mappings += fragment2.all_mappings(:doc=>doc)
|
58
|
+
end
|
59
|
+
old_docs = old_mappings.map { |mapping| mapping[:doc] }
|
60
|
+
|
61
|
+
# Get mixed fragment
|
62
|
+
new_fragment = mix(fragment1, fragment2, options)
|
38
63
|
|
39
|
-
#
|
40
|
-
|
41
|
-
|
64
|
+
# Get new mappings
|
65
|
+
new_mappings = []
|
66
|
+
docs.each { |doc| new_mappings += new_fragment.mappings(:doc=>doc) }
|
67
|
+
new_docs = new_mappings.map { |mapping| mapping[:doc] }
|
68
|
+
|
69
|
+
# Optimize subfragments
|
70
|
+
subfragments = optimize(fragment1.sc::subfragment + fragment2.sc::subfragment, options.merge(:docs=>new_docs))
|
71
|
+
subfragments.each { |subfragment| new_fragment.graph << subfragment }
|
72
|
+
new_fragment.sc::subfragment = subfragments.map &:node
|
73
|
+
|
74
|
+
# End if the new fragment returns the same results
|
75
|
+
if true
|
76
|
+
return fragments - [fragment1] - [fragment2] + [new_fragment]
|
42
77
|
end
|
43
78
|
end
|
44
79
|
end
|
45
|
-
|
80
|
+
fragments
|
46
81
|
end
|
47
82
|
|
48
|
-
def
|
49
|
-
|
50
|
-
return if !fragment1.sc::type.equivalent?(fragment2.sc::type) or
|
51
|
-
!fragment1.sc::relation.equivalent?(fragment2.sc::relation) or
|
52
|
-
!fragment1.sc::superclass.equivalent?(fragment2.sc::superclass) or
|
53
|
-
!fragment1.sc::sameas.equivalent?(fragment2.sc::sameas) or
|
54
|
-
fragment1.sc::identifier.size != fragment2.sc::identifier.size
|
83
|
+
def mix fragment1, fragment2, options
|
84
|
+
docs = options[:docs]
|
55
85
|
|
56
86
|
# Build new fragment
|
57
|
-
new_fragment = Node(nil)
|
58
|
-
new_fragment.rdf::type =
|
87
|
+
new_fragment = Node(nil).proxy(Node('sc:Fragment'))
|
88
|
+
new_fragment.rdf::type = Node('sc:Fragment')
|
59
89
|
new_fragment.sc::type = fragment1.sc::type
|
60
90
|
new_fragment.sc::relation = fragment1.sc::relation
|
61
91
|
new_fragment.sc::superclass = fragment1.sc::superclass
|
62
92
|
new_fragment.sc::sameas = fragment1.sc::sameas
|
93
|
+
|
94
|
+
# If fragments share the same parent, cardinality has to increase
|
95
|
+
# Otherwise, they might map the same subdocument, so cardinality
|
96
|
+
# limits are made more general.
|
97
|
+
if fragment1.sc::superfragment.first and fragment1.sc::superfragment == fragment2.sc::superfragment
|
98
|
+
new_fragment.sc::min_cardinality = (fragment1.sc::min_cardinality.first.to_i + fragment2.sc::min_cardinality.first.to_i).to_s
|
99
|
+
new_fragment.sc::max_cardinality = (fragment1.sc::max_cardinality.first.to_i + fragment2.sc::max_cardinality.first.to_i).to_s
|
100
|
+
else
|
101
|
+
new_fragment.sc::min_cardinality = [fragment1.sc::min_cardinality.first.to_i, + fragment2.sc::min_cardinality.first.to_i].min.to_s
|
102
|
+
new_fragment.sc::max_cardinality = [fragment1.sc::max_cardinality.first.to_i, + fragment2.sc::max_cardinality.first.to_i].max.to_s
|
103
|
+
end
|
63
104
|
|
64
105
|
# sc:selector
|
65
|
-
selector =
|
106
|
+
selector = generalize(fragment1.sc::selector + fragment2.sc::selector)
|
66
107
|
new_fragment.graph << selector
|
67
108
|
new_fragment.sc::selector = selector
|
68
109
|
|
69
110
|
# sc:identifier
|
70
111
|
if fragment1.sc::identifier.first
|
71
|
-
selector =
|
112
|
+
selector = generalize(fragment1.sc::identifier + fragment2.sc::identifier)
|
72
113
|
new_fragment.graph << selector
|
73
114
|
new_fragment.sc::identifier = selector
|
74
115
|
end
|
75
|
-
|
76
|
-
#
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
sf.sc::relation.sort_by(&:to_s) == relations
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
# Check new output
|
86
|
-
separate_output1 = fragment1.extract_graph :doc=>doc
|
87
|
-
separate_output2 = fragment2.extract_graph :doc=>doc
|
88
|
-
separate_output = separate_output1.merge separate_output2
|
89
|
-
new_output = new_fragment.proxy.extract_graph :doc=>doc
|
90
|
-
|
91
|
-
# Check if the output with the new fragment is a subset of the full output
|
92
|
-
# and if the output of the fragments alone is a subset of the output of the new
|
93
|
-
# fragment. This way we ensure the output is the same without using all the
|
94
|
-
# fragments that are available in the knowledge base.
|
95
|
-
new_fragment.proxy # if output.contains?(new_output) and new_output.contains?(separate_output)
|
116
|
+
|
117
|
+
# All new nodes are expected to be inconsistent after performing
|
118
|
+
# subfragments' extractions. Otherwise, if new nodes are consistent, it means
|
119
|
+
# the output from the mixed fragment is different from the separate fragments
|
120
|
+
# and therefore the generalization has failed, so no mixed fragment is returned
|
121
|
+
new_fragment
|
96
122
|
end
|
97
|
-
|
98
|
-
|
123
|
+
|
124
|
+
# Generalize a set of selectors
|
125
|
+
def generalize selectors
|
99
126
|
selector = Node(nil)
|
100
127
|
selector.rdf::type = Node('sc:VisualSelector')
|
101
128
|
selector.sc::min_relative_x = selectors.map { |s| s.sc::min_relative_x.map(&:to_i) }.flatten.min.to_s
|
data/lib/scrappy/server/admin.rb
CHANGED
@@ -58,8 +58,9 @@ module Scrappy
|
|
58
58
|
# Patterns
|
59
59
|
|
60
60
|
app.get '/patterns' do
|
61
|
-
@uris = Scrappy::Kb.patterns.find(nil, Node('rdf:type'), Node('sc:Fragment'))
|
62
|
-
|
61
|
+
@uris = ( Scrappy::Kb.patterns.find(nil, Node('rdf:type'), Node('sc:Fragment')) -
|
62
|
+
Scrappy::Kb.patterns.find([], Node('sc:subfragment'), nil) ).
|
63
|
+
map { |node| node.sc::type }.flatten.map(&:to_s).sort
|
63
64
|
haml :patterns
|
64
65
|
end
|
65
66
|
|
@@ -95,7 +96,7 @@ module Scrappy
|
|
95
96
|
end
|
96
97
|
|
97
98
|
app.post '/samples/:id/optimize' do |id|
|
98
|
-
Scrappy::Kb.patterns = agent.
|
99
|
+
Scrappy::Kb.patterns = agent.optimize_patterns(Scrappy::Kb.patterns, Scrappy::App.samples[id.to_i])
|
99
100
|
Scrappy::App.save_patterns Scrappy::Kb.patterns
|
100
101
|
flash[:notice] = "Optimization completed"
|
101
102
|
redirect "#{settings.base_uri}/samples"
|
data/scrappy.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.3.
|
5
|
+
s.version = "0.3.5"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-03-
|
9
|
+
s.date = %q{2011-03-29}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 3
|
8
|
-
-
|
9
|
-
version: 0.3.
|
8
|
+
- 5
|
9
|
+
version: 0.3.5
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-03-
|
17
|
+
date: 2011-03-29 00:00:00 +02:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|