scrappy 0.3.2 → 0.3.3
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +7 -0
- data/Manifest +3 -2
- data/bin/scrappy +6 -5
- data/{kb → extractors}/elmundo.yarf +0 -0
- data/lib/scrappy/agent/agent.rb +1 -0
- data/lib/scrappy/extractor/extractor.rb +14 -64
- data/lib/scrappy/extractor/fragment.rb +131 -70
- data/lib/scrappy/extractor/selectors/new_uri.rb +15 -12
- data/lib/scrappy/extractor/selectors/root.rb +1 -1
- data/lib/scrappy/extractor/selectors/slice.rb +1 -1
- data/lib/scrappy/extractor/selectors/visual.rb +41 -20
- data/lib/scrappy/extractor/selectors/xpath.rb +1 -1
- data/lib/scrappy/learning/optimizer.rb +121 -0
- data/lib/scrappy/{trainer → learning}/trainer.rb +14 -21
- data/lib/scrappy/server/admin.rb +7 -1
- data/lib/scrappy/support.rb +24 -0
- data/lib/scrappy.rb +3 -2
- data/public/javascripts/annotator.js +22 -10
- data/public/stylesheets/application.css +1 -1
- data/scrappy.gemspec +7 -7
- data/views/samples.haml +2 -0
- metadata +11 -33
data/History.txt
CHANGED
data/Manifest
CHANGED
@@ -3,7 +3,7 @@ Manifest
|
|
3
3
|
README.rdoc
|
4
4
|
Rakefile
|
5
5
|
bin/scrappy
|
6
|
-
|
6
|
+
extractors/elmundo.yarf
|
7
7
|
lib/scrappy.rb
|
8
8
|
lib/scrappy/agent/agent.rb
|
9
9
|
lib/scrappy/agent/blind_agent.rb
|
@@ -24,13 +24,14 @@ lib/scrappy/extractor/selectors/uri.rb
|
|
24
24
|
lib/scrappy/extractor/selectors/uri_pattern.rb
|
25
25
|
lib/scrappy/extractor/selectors/visual.rb
|
26
26
|
lib/scrappy/extractor/selectors/xpath.rb
|
27
|
+
lib/scrappy/learning/optimizer.rb
|
28
|
+
lib/scrappy/learning/trainer.rb
|
27
29
|
lib/scrappy/repository.rb
|
28
30
|
lib/scrappy/server/admin.rb
|
29
31
|
lib/scrappy/server/errors.rb
|
30
32
|
lib/scrappy/server/helpers.rb
|
31
33
|
lib/scrappy/server/server.rb
|
32
34
|
lib/scrappy/support.rb
|
33
|
-
lib/scrappy/trainer/trainer.rb
|
34
35
|
public/favicon.ico
|
35
36
|
public/images/logo.png
|
36
37
|
public/images/logo_tiny.png
|
data/bin/scrappy
CHANGED
@@ -40,8 +40,7 @@ module Scrappy
|
|
40
40
|
opts.on('-c C', '--concurrence C') { |c| Agent::Options.workers = c.to_i }
|
41
41
|
opts.on('-d D', '--delay D') { |d| Agent::Options.delay = d; Agent::Options.workers = 1 }
|
42
42
|
opts.on('-l L', '--levels L') { |l| Agent::Options.depth = l.to_i }
|
43
|
-
opts.on('-r', '--reference') { Agent::Options.referenceable =
|
44
|
-
opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
|
43
|
+
opts.on('-r', '--reference') { Agent::Options.referenceable = true }
|
45
44
|
opts.on('-f FORMAT', '--format FORMAT') { |f| Agent::Options.format = f.to_sym }
|
46
45
|
opts.on('-t TIME', '--time TIME') { |t| Agent::Options.time = t.to_i*60 } # converts minutes to seconds
|
47
46
|
opts.on('-o URIs', '--observe URIs') { |uris| Options.observe = uris.split(',') }
|
@@ -106,9 +105,12 @@ module Scrappy
|
|
106
105
|
end
|
107
106
|
def self.add_pattern graph
|
108
107
|
new_patterns = Scrappy::Kb.patterns.merge graph
|
109
|
-
|
108
|
+
save_patterns new_patterns
|
110
109
|
onload
|
111
110
|
end
|
111
|
+
def self.save_patterns new_patterns
|
112
|
+
open(@patterns_file, "w") { |f| f.write new_patterns.serialize(:yarf) }
|
113
|
+
end
|
112
114
|
def self.delete_pattern uri
|
113
115
|
graph = Scrappy::Kb.patterns
|
114
116
|
fragments = graph.find(nil, Node('rdf:type'), Node('sc:Fragment')).
|
@@ -169,8 +171,7 @@ Options
|
|
169
171
|
-a, --admin [ROOT] Runs admin web server (optionally specify server's root url)
|
170
172
|
-P, --port PORT Selects port number (default is 3434)
|
171
173
|
-t, --time TIME Returns repository data from the last given minutes
|
172
|
-
-r, --reference Outputs
|
173
|
-
-R, --reference-all Outputs all HTML referenceable data
|
174
|
+
-r, --reference Outputs reference information
|
174
175
|
|
175
176
|
Authors
|
176
177
|
José Ignacio Fernández, Alberto Mardomingo, Jacobo Blasco
|
File without changes
|
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -28,9 +28,6 @@ module Scrappy
|
|
28
28
|
end
|
29
29
|
end
|
30
30
|
|
31
|
-
# Add references to sources if requested
|
32
|
-
triples += add_referenceable_data uri, content, triples, referenceable if referenceable
|
33
|
-
|
34
31
|
puts "done!" if self.options.debug
|
35
32
|
|
36
33
|
triples
|
@@ -38,71 +35,24 @@ module Scrappy
|
|
38
35
|
end
|
39
36
|
|
40
37
|
def fragments_for kb, uri
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
38
|
+
root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
|
39
|
+
|
40
|
+
selectors = []
|
41
|
+
fragments = {}
|
42
|
+
root_fragments.each do |fragment|
|
43
|
+
fragment.sc::selector.each do |selector|
|
44
|
+
fragments[selector] = fragment
|
45
|
+
selectors << selector
|
46
|
+
end
|
45
47
|
end
|
46
|
-
|
47
|
-
visual_selectors = kb.find(nil, Node('rdf:type'), Node('sc:VisualSelector'))
|
48
|
-
|
49
|
-
selectors = uri_selectors + visual_selectors
|
50
|
-
|
51
|
-
selectors.map { |selector| kb.find(nil, Node('sc:selector'), selector) }.
|
52
|
-
flatten.
|
53
|
-
select { |selector| selector.rdf::type.include?(Node('sc:Fragment')) }
|
54
|
-
end
|
55
|
-
|
56
|
-
private
|
57
|
-
def add_referenceable_data uri, content, given_triples, referenceable
|
58
|
-
triples = []
|
59
|
-
resources = {}; given_triples.each { |s,p,o| resources[s] = resources[o] = true }
|
60
48
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
selector.rdf::type = Node('sc:UnivocalSelector')
|
66
|
-
selector.sc::path = '/'
|
67
|
-
selector.sc::document = uri
|
49
|
+
uri_selectors = selectors.select { |selector| selector.rdf::type.include?(Node('sc:UriSelector')) or
|
50
|
+
selector.rdf::type.include?(Node('sc:UriPatternSelector')) }.
|
51
|
+
select { |selector| !kb.node(selector).filter(:uri=>uri).empty? }
|
68
52
|
|
69
|
-
|
70
|
-
|
71
|
-
triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples) if referenceable==:dump or resources[fragment.id]
|
72
|
-
|
73
|
-
content.search('*').each do |node|
|
74
|
-
next if node.text?
|
75
|
-
|
76
|
-
fragment = Extractor.node_hash(uri, node.path)
|
77
|
-
|
78
|
-
if referenceable == :dump or resources[fragment]
|
79
|
-
selector = ID(nil)
|
80
|
-
presentation = ID(nil)
|
81
|
-
|
82
|
-
triples << [selector, ID('rdf:type'), ID('sc:UnivocalSelector')]
|
83
|
-
triples << [selector, ID('sc:path'), node.path.to_s]
|
84
|
-
triples << [selector, ID('sc:tag'), node.name.to_s]
|
85
|
-
triples << [selector, ID('sc:document'), uri]
|
86
|
-
|
87
|
-
triples << [presentation, ID('sc:x'), node[:vx].to_s] if node[:vx]
|
88
|
-
triples << [presentation, ID('sc:y'), node[:vy].to_s] if node[:vy]
|
89
|
-
triples << [presentation, ID('sc:width'), node[:vw].to_s] if node[:vw]
|
90
|
-
triples << [presentation, ID('sc:height'), node[:vh].to_s] if node[:vh]
|
91
|
-
triples << [presentation, ID('sc:font_size'), node[:vsize].gsub("px","").to_s] if node[:vsize]
|
92
|
-
triples << [presentation, ID('sc:font_family'), node[:vfont]] if node[:vfont]
|
93
|
-
triples << [presentation, ID('sc:font_weight'), node[:vweight].to_s] if node[:vweight]
|
94
|
-
triples << [presentation, ID('sc:text'), node.text.strip]
|
95
|
-
|
96
|
-
triples << [fragment, ID('sc:selector'), selector]
|
97
|
-
triples << [fragment, ID('sc:presentation'), presentation]
|
98
|
-
end
|
99
|
-
end
|
100
|
-
triples
|
101
|
-
end
|
53
|
+
visual_selectors = selectors.select { |selector| selector.rdf::type.include?(Node('sc:VisualSelector')) }
|
102
54
|
|
103
|
-
|
104
|
-
digest = Digest::MD5.hexdigest("#{uri} #{path}")
|
105
|
-
:"_:bnode#{digest}"
|
55
|
+
(uri_selectors + visual_selectors).map { |selector| fragments[selector] }
|
106
56
|
end
|
107
57
|
end
|
108
58
|
end
|
@@ -2,101 +2,126 @@ module Sc
|
|
2
2
|
class Fragment
|
3
3
|
include RDF::NodeProxy
|
4
4
|
|
5
|
+
# Extracts data out of a document and returns an RDF::Graph
|
6
|
+
def extract_graph options={}
|
7
|
+
graph = RDF::Graph.new
|
8
|
+
extract(options).each { |node| graph << node }
|
9
|
+
graph
|
10
|
+
end
|
11
|
+
|
12
|
+
# Extracts data out of a document and returns an array of nodes
|
5
13
|
def extract options={}
|
6
|
-
|
14
|
+
# Extracts all the mappings and any subfragment
|
15
|
+
mappings(options).map do |result|
|
16
|
+
node = result[:node]
|
17
|
+
subfragments = result[:subfragments]
|
18
|
+
doc = result[:doc]
|
19
|
+
|
20
|
+
# Process subfragments
|
21
|
+
consistent = true
|
22
|
+
subfragments.each do |subfragment|
|
23
|
+
# Get subfragment object
|
24
|
+
subfragment = subfragment.proxy Node('sc:Fragment')
|
25
|
+
|
26
|
+
# Extract data from the subfragment
|
27
|
+
subnodes = subfragment.extract(options.merge(:doc=>doc))
|
28
|
+
|
29
|
+
# Add relations
|
30
|
+
subnodes.each do |subnode|
|
31
|
+
node.graph << subnode if subnode.is_a?(RDF::Node)
|
32
|
+
subfragment.sc::relation.each { |relation| node[relation] += [subnode] }
|
33
|
+
end
|
34
|
+
|
35
|
+
# Check consistency
|
36
|
+
consistent = false if subfragment.sc::min_cardinality.first and subnodes.size < subfragment.sc::min_cardinality.first.to_i
|
37
|
+
consistent = false if subfragment.sc::max_cardinality.first and subnodes.size > subfragment.sc::max_cardinality.first.to_i
|
38
|
+
end
|
39
|
+
|
40
|
+
# Skip the node if it has inconsistent relations
|
41
|
+
# For example: extracting a sioc:Post with no dc:title would
|
42
|
+
# violate the constraint sc:min_cardinality = 1
|
43
|
+
next if !consistent
|
44
|
+
|
45
|
+
node
|
46
|
+
end.compact
|
47
|
+
end
|
7
48
|
|
49
|
+
# Returns all the mappings between this fragment and RDF nodes
|
50
|
+
def mappings options
|
8
51
|
# Identify the fragment's mappings
|
9
52
|
docs = sc::selector.map { |s| graph.node(s).select options[:doc] }.flatten
|
10
53
|
|
11
|
-
# Generate
|
54
|
+
# Generate a result for each page mapping
|
12
55
|
docs.map do |doc|
|
13
56
|
# Build RDF nodes from identifier selectors (if present)
|
14
|
-
|
57
|
+
node = build_node(doc, options[:referenceable])
|
15
58
|
|
16
|
-
#
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
value = doc[:value].to_s.strip
|
21
|
-
if options[:referenceable]
|
22
|
-
node.rdf::value = value
|
23
|
-
node.rdf::type = Node('rdf:Literal')
|
24
|
-
node
|
25
|
-
else
|
26
|
-
value
|
27
|
-
end
|
28
|
-
else
|
29
|
-
# Add statements about the node
|
30
|
-
sc::type.each { |type| node.rdf::type += [type] if type != Node('rdf:Resource') }
|
31
|
-
sc::superclass.each { |superclass| node.rdfs::subClassOf += [superclass] }
|
32
|
-
sc::sameas.each { |samenode| node.owl::sameAs += [samenode] }
|
59
|
+
# Skip the node if no URI or bnode is created
|
60
|
+
next if !node
|
61
|
+
|
62
|
+
# Add info to the node
|
33
63
|
|
64
|
+
# Build the object -- it can be a node or a literal
|
65
|
+
object = if sc::type.include?(Node('rdf:Literal'))
|
66
|
+
value = doc[:value].to_s.strip
|
67
|
+
if options[:referenceable]
|
68
|
+
node.rdf::value = value
|
69
|
+
node.rdf::type = Node('rdf:Literal')
|
34
70
|
node
|
71
|
+
else
|
72
|
+
value
|
35
73
|
end
|
74
|
+
else
|
75
|
+
# Add statements about the node
|
76
|
+
sc::type.each { |type| node.rdf::type += [type] if type != Node('rdf:Resource') }
|
77
|
+
sc::superclass.each { |superclass| node.rdfs::subClassOf += [superclass] }
|
78
|
+
sc::sameas.each { |samenode| node.owl::sameAs += [samenode] }
|
36
79
|
|
37
|
-
|
38
|
-
|
39
|
-
sc::subfragment.each do |subfragment|
|
40
|
-
# Get subfragment object
|
41
|
-
subfragment = graph.node(subfragment, Node('sc:Fragment'))
|
42
|
-
# Extract data from the subfragment
|
43
|
-
subnodes = subfragment.extract(options.merge(:doc=>doc))
|
44
|
-
|
45
|
-
# Add relations
|
46
|
-
subnodes.each do |subnode|
|
47
|
-
node.graph << subnode if subnode.is_a?(RDF::Node)
|
48
|
-
subfragment.sc::relation.each { |relation| node[relation] += [subnode] }
|
49
|
-
end
|
50
|
-
|
51
|
-
# Check consistency
|
52
|
-
consistent = false if subfragment.sc::min_cardinality.first and subnodes.size < subfragment.sc::min_cardinality.first.to_i
|
53
|
-
consistent = false if subfragment.sc::max_cardinality.first and subnodes.size > subfragment.sc::max_cardinality.first.to_i
|
54
|
-
end
|
80
|
+
node
|
81
|
+
end
|
55
82
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
sc::type.each { |type| source.sc::type += [type] }
|
66
|
-
sc::relation.each { |relation| source.sc::relation += [relation] }
|
67
|
-
node.graph << source
|
68
|
-
node.sc::source += [source]
|
69
|
-
end
|
70
|
-
end
|
71
|
-
|
72
|
-
# Object points to either the node or the literal
|
73
|
-
object
|
83
|
+
# Add referenceable data if requested
|
84
|
+
if options[:referenceable] and node.size > 0
|
85
|
+
source = reference(doc)
|
86
|
+
source.sc::type = sc::type
|
87
|
+
source.sc::superclass = sc::superclass
|
88
|
+
source.sc::sameas = sc::sameas
|
89
|
+
source.sc::relation = sc::relation
|
90
|
+
node.graph << source
|
91
|
+
node.sc::source = source
|
74
92
|
end
|
75
|
-
|
93
|
+
|
94
|
+
# Variable object points to either a node or a literal
|
95
|
+
# Return the object, as well as its subfragments (if any)
|
96
|
+
# and the doc it was extracted from
|
97
|
+
{ :node=>object, :subfragments=>sc::subfragment, :doc=>doc }
|
98
|
+
end.compact
|
76
99
|
end
|
77
100
|
|
78
|
-
|
79
|
-
|
80
|
-
|
101
|
+
private
|
102
|
+
# Builds a node given a document
|
103
|
+
def build_node doc, referenceable
|
104
|
+
return Node(nil) if sc::identifier.empty?
|
105
|
+
|
106
|
+
sc::identifier.map { |s| graph.node(s).select doc }.flatten.map do |d|
|
107
|
+
node = Node(parse_uri(d[:uri], d[:value]))
|
81
108
|
|
82
109
|
if referenceable
|
83
110
|
# Include the fragment where the URI was built from
|
84
|
-
uri_node
|
85
|
-
|
86
|
-
|
87
|
-
node.sc::uri = uri_node
|
111
|
+
uri_node = Node(nil)
|
112
|
+
source = reference(d)
|
113
|
+
uri_node.graph << source
|
88
114
|
uri_node.rdf::value = node.to_s
|
89
|
-
uri_node.sc::source =
|
115
|
+
uri_node.sc::source = source
|
116
|
+
|
117
|
+
node.graph << uri_node
|
118
|
+
node.sc::uri = uri_node
|
90
119
|
end
|
91
120
|
|
92
121
|
node
|
93
|
-
end
|
94
|
-
nodes << Node(nil) if nodes.empty?
|
95
|
-
|
96
|
-
nodes
|
122
|
+
end.first
|
97
123
|
end
|
98
124
|
|
99
|
-
private
|
100
125
|
# Parses a URI by resolving relative paths
|
101
126
|
def parse_uri(uri, rel_uri)
|
102
127
|
return ID('*') if rel_uri.nil?
|
@@ -107,5 +132,41 @@ module Sc
|
|
107
132
|
end
|
108
133
|
end
|
109
134
|
|
135
|
+
# Builds an RDF reference to an HTML node
|
136
|
+
def reference doc
|
137
|
+
node = doc[:content].is_a?(Nokogiri::XML::NodeSet) ? doc[:content].first.parent : doc[:content]
|
138
|
+
attribute = doc[:attribute]
|
139
|
+
uri = doc[:uri]
|
140
|
+
|
141
|
+
source = Node(nil)
|
142
|
+
selector = Node(nil)
|
143
|
+
presentation = Node(nil)
|
144
|
+
|
145
|
+
source.graph << selector
|
146
|
+
source.sc::selector = selector
|
147
|
+
|
148
|
+
selector.rdf::type = Node('sc:UnivocalSelector')
|
149
|
+
selector.sc::path = node.path
|
150
|
+
selector.sc::document = uri
|
151
|
+
selector.sc::attribute = attribute if attribute
|
152
|
+
|
153
|
+
if node.path != '/'
|
154
|
+
selector.sc::tag = node.name
|
155
|
+
source.graph << presentation
|
156
|
+
source.sc::presentation = presentation
|
157
|
+
end
|
158
|
+
|
159
|
+
presentation.sc::x = node[:vx] if node[:vx]
|
160
|
+
presentation.sc::y = node[:vy] if node[:vy]
|
161
|
+
presentation.sc::width = node[:vw] if node[:vw]
|
162
|
+
presentation.sc::height = node[:vh] if node[:vh]
|
163
|
+
presentation.sc::font_size = node[:vsize] if node[:vsize]
|
164
|
+
presentation.sc::font_family = node[:vfont] if node[:vfont]
|
165
|
+
presentation.sc::font_weight = node[:vweight] if node[:vweight]
|
166
|
+
presentation.sc::text = node.text.strip
|
167
|
+
|
168
|
+
source
|
169
|
+
end
|
170
|
+
|
110
171
|
end
|
111
172
|
end
|
@@ -3,30 +3,33 @@ module Sc
|
|
3
3
|
def filter doc
|
4
4
|
contents = if sc::attribute.first
|
5
5
|
# Select node's attribute if given
|
6
|
-
sc::attribute.map { |attribute| doc[:content][attribute] }
|
6
|
+
sc::attribute.map { |attribute| [doc[:content][attribute], attribute] }
|
7
7
|
else
|
8
|
-
[ doc[:value] ]
|
8
|
+
[ [doc[:value], nil] ]
|
9
9
|
end
|
10
10
|
|
11
11
|
@indexes ||= Hash.new(0)
|
12
12
|
prefix = sc::prefix.first.to_s
|
13
|
-
prefix = (prefix =~ /\Ahttp
|
13
|
+
prefix = (prefix =~ /\Ahttp\:/ or prefix =~ /\Ahttps\:/) ? URI::parse(doc[:uri]).merge(prefix).to_s : "#{doc[:uri]}#{prefix}"
|
14
14
|
suffix = sc::suffix.first.to_s
|
15
15
|
|
16
|
-
contents.map do |content|
|
17
|
-
|
18
|
-
|
16
|
+
contents.map do |content, attribute|
|
17
|
+
new_uri = if (content.to_s =~ /\Ahttp\:/ or content.to_s =~ /\Ahttps\:/)
|
18
|
+
"#{content}#{suffix}"
|
19
19
|
else
|
20
|
-
if sc::
|
21
|
-
|
20
|
+
variable = if sc::sequence.first.to_s=="true"
|
21
|
+
@indexes[prefix] += 1
|
22
22
|
else
|
23
|
-
|
23
|
+
if sc::downcase.first.to_s=="true"
|
24
|
+
content.to_s.underscore
|
25
|
+
else
|
26
|
+
content.to_s.wikify
|
27
|
+
end
|
24
28
|
end
|
29
|
+
"#{prefix}#{variable}#{suffix}"
|
25
30
|
end
|
26
31
|
|
27
|
-
new_uri
|
28
|
-
|
29
|
-
{ :uri=>new_uri, :content=>doc[:content], :value=>new_uri }
|
32
|
+
{ :uri=>new_uri, :content=>doc[:content], :value=>new_uri, :attribute=>attribute }
|
30
33
|
end
|
31
34
|
end
|
32
35
|
end
|
@@ -3,7 +3,7 @@ module Sc
|
|
3
3
|
def filter doc
|
4
4
|
if sc::attribute.first
|
5
5
|
# Select node's attribute if given
|
6
|
-
sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute] } }
|
6
|
+
sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute], :attribute=>attribute } }
|
7
7
|
else
|
8
8
|
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:value], sc::format, doc[:uri]) } ]
|
9
9
|
end
|
@@ -5,7 +5,7 @@ module Sc
|
|
5
5
|
slices = doc[:value].split(separator)
|
6
6
|
sc::index.map { |index| slices[index.to_i].to_s.strip }.
|
7
7
|
select { |value| value != "" }.
|
8
|
-
map { |value| { :uri=>doc[:uri], :content=>doc[:content], :value=>value} }
|
8
|
+
map { |value| { :uri=>doc[:uri], :content=>doc[:content], :value=>value, :attribute=>doc[:attribute]} }
|
9
9
|
end.flatten
|
10
10
|
end
|
11
11
|
end
|
@@ -1,37 +1,58 @@
|
|
1
1
|
module Sc
|
2
2
|
class VisualSelector < Selector
|
3
3
|
def filter doc
|
4
|
+
# By initializing variables, we avoid getting data from a hash (slow)
|
5
|
+
min_relative_x = (sc::min_relative_x.first.to_i if sc::min_relative_x.first)
|
6
|
+
max_relative_x = (sc::max_relative_x.first.to_i if sc::max_relative_x.first)
|
7
|
+
min_relative_y = (sc::min_relative_y.first.to_i if sc::min_relative_y.first)
|
8
|
+
max_relative_y = (sc::max_relative_y.first.to_i if sc::max_relative_y.first)
|
9
|
+
min_x = (sc::min_x.first.to_i if sc::min_x.first)
|
10
|
+
max_x = (sc::max_x.first.to_i if sc::max_x.first)
|
11
|
+
min_y = (sc::min_y.first.to_i if sc::min_y.first)
|
12
|
+
max_y = (sc::max_y.first.to_i if sc::max_y.first)
|
13
|
+
min_width = (sc::min_width.first.to_i if sc::min_width.first)
|
14
|
+
max_width = (sc::max_width.first.to_i if sc::max_width.first)
|
15
|
+
min_height = (sc::min_height.first.to_i if sc::min_height.first)
|
16
|
+
max_height = (sc::max_height.first.to_i if sc::max_height.first)
|
17
|
+
min_font_size = (sc::min_font_size.first.to_i if sc::min_font_size.first)
|
18
|
+
max_font_size = (sc::max_font_size.first.to_i if sc::max_font_size.first)
|
19
|
+
min_font_weight = (sc::min_font_weight.first.to_i if sc::min_font_weight.first)
|
20
|
+
max_font_weight = (sc::max_font_weight.first.to_i if sc::max_font_weight.first)
|
21
|
+
font_family = sc::font_family.first
|
22
|
+
attributes = sc::attribute
|
23
|
+
formats = sc::format
|
24
|
+
|
4
25
|
doc[:content].search(sc::tag.first || "*").select do |node|
|
5
26
|
relative_x = node['vx'].to_i - doc[:content]['vx'].to_i
|
6
27
|
relative_y = node['vy'].to_i - doc[:content]['vy'].to_i
|
7
28
|
|
8
29
|
!node.text? and
|
9
|
-
( !
|
10
|
-
( !
|
11
|
-
( !
|
12
|
-
( !
|
30
|
+
( !min_relative_x or relative_x >= min_relative_x) and
|
31
|
+
( !max_relative_x or relative_x <= max_relative_x) and
|
32
|
+
( !min_relative_y or relative_y >= min_relative_y) and
|
33
|
+
( !max_relative_y or relative_y <= max_relative_y) and
|
13
34
|
|
14
|
-
( !
|
15
|
-
( !
|
16
|
-
( !
|
17
|
-
( !
|
35
|
+
( !min_x or node['vx'].to_i >= min_x) and
|
36
|
+
( !max_x or node['vx'].to_i <= max_x) and
|
37
|
+
( !min_y or node['vy'].to_i >= min_y) and
|
38
|
+
( !max_y or node['vy'].to_i <= max_y) and
|
18
39
|
|
19
|
-
( !
|
20
|
-
( !
|
21
|
-
( !
|
22
|
-
( !
|
40
|
+
( !min_width or node['vw'].to_i >= min_width) and
|
41
|
+
( !max_width or node['vw'].to_i <= max_width) and
|
42
|
+
( !min_height or node['vh'].to_i >= min_height) and
|
43
|
+
( !max_height or node['vh'].to_i <= max_height) and
|
23
44
|
|
24
|
-
( !
|
25
|
-
( !
|
26
|
-
( !
|
27
|
-
( !
|
28
|
-
( !
|
45
|
+
( !min_font_size or node['vsize'].to_i >= min_font_size) and
|
46
|
+
( !max_font_size or node['vsize'].to_i <= max_font_size) and
|
47
|
+
( !min_font_weight or node['vweight'].to_i >= min_font_weight) and
|
48
|
+
( !max_font_weight or node['vweight'].to_i <= max_font_weight) and
|
49
|
+
( !font_family or node['vfont'] == font_family)
|
29
50
|
end.map do |content|
|
30
|
-
if
|
51
|
+
if attributes.first
|
31
52
|
# Select node's attribute if given
|
32
|
-
|
53
|
+
attributes.map { |attribute| { :uri=>doc[:uri], :content=>content, :value=>content[attribute], :attribute=>attribute } }
|
33
54
|
else
|
34
|
-
[ { :uri=>doc[:uri], :content=>content, :value=>format(content,
|
55
|
+
[ { :uri=>doc[:uri], :content=>content, :value=>format(content, formats, doc[:uri]) } ]
|
35
56
|
end
|
36
57
|
end.flatten
|
37
58
|
end
|
@@ -11,7 +11,7 @@ module Sc
|
|
11
11
|
(doc[:content].search(pattern)[interval] || []).select { |node| patterns.any? ? patterns.include?(node.text.downcase.strip) : true }.map do |result|
|
12
12
|
if sc::attribute.first
|
13
13
|
# Select node's attribute if given
|
14
|
-
sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
|
14
|
+
sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute], :attribute=>attribute } }
|
15
15
|
else
|
16
16
|
# Select node
|
17
17
|
[ { :uri=>doc[:uri], :content=>result, :value=>format(result, sc::format, doc[:uri]) } ]
|
@@ -0,0 +1,121 @@
|
|
1
|
+
module Scrappy
|
2
|
+
module Optimizer
|
3
|
+
# Iterates through a knowledge base and tries to merge and generalize
|
4
|
+
# selectors whenever the output of the resulting kb is the same
|
5
|
+
def optimize kb, sample
|
6
|
+
# Get the output only once
|
7
|
+
output = RDF::Graph.new extract(sample[:uri], sample[:html], kb)
|
8
|
+
|
9
|
+
# Build an array of fragments
|
10
|
+
root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
|
11
|
+
fragments = []; root_fragments.each { |f| fragments << kb.node(Node(f.id, RDF::Graph.new(f.all_triples))) }
|
12
|
+
|
13
|
+
# Parse the document
|
14
|
+
doc = { :uri=>sample[:uri], :content=>Nokogiri::HTML(sample[:html], nil, 'utf-8') }
|
15
|
+
|
16
|
+
begin
|
17
|
+
changed, fragments = optimize_once fragments, doc, output
|
18
|
+
end until !changed
|
19
|
+
|
20
|
+
graph = RDF::Graph.new
|
21
|
+
fragments.each { |fragment| graph << fragment }
|
22
|
+
|
23
|
+
graph
|
24
|
+
end
|
25
|
+
|
26
|
+
protected
|
27
|
+
# Tries to optimize a set of fragments.
|
28
|
+
# Returns true if there were changes, false otherwise,
|
29
|
+
# and the new fragments as the second array element
|
30
|
+
def optimize_once fragments, doc, output
|
31
|
+
fragments.each do |fragment1|
|
32
|
+
fragments.each do |fragment2|
|
33
|
+
next if fragment1 == fragment2
|
34
|
+
new_fragment = mix_if_gain(fragment1, fragment2, doc, output)
|
35
|
+
|
36
|
+
# End if a new fragment was created
|
37
|
+
if new_fragment
|
38
|
+
return [true, fragments - [fragment1] - [fragment2] + [new_fragment]]
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
[false, fragments]
|
43
|
+
end
|
44
|
+
|
45
|
+
def mix_if_gain fragment1, fragment2, doc, output
|
46
|
+
# Won't get gain if the fragment does not produce the same kind of RDF resource
|
47
|
+
return if !fragment1.sc::type.equivalent?(fragment2.sc::type) or
|
48
|
+
!fragment1.sc::relation.equivalent?(fragment2.sc::relation) or
|
49
|
+
!fragment1.sc::superclass.equivalent?(fragment2.sc::superclass) or
|
50
|
+
!fragment1.sc::sameas.equivalent?(fragment2.sc::sameas) or
|
51
|
+
fragment1.sc::identifier.size != fragment2.sc::identifier.size
|
52
|
+
|
53
|
+
# Build new fragment
|
54
|
+
new_fragment = Node(nil)
|
55
|
+
new_fragment.rdf::type = fragment1.rdf::type
|
56
|
+
new_fragment.sc::type = fragment1.sc::type
|
57
|
+
new_fragment.sc::relation = fragment1.sc::relation
|
58
|
+
new_fragment.sc::superclass = fragment1.sc::superclass
|
59
|
+
new_fragment.sc::sameas = fragment1.sc::sameas
|
60
|
+
|
61
|
+
# sc:selector
|
62
|
+
selector = generalize_selectors(fragment1.sc::selector + fragment2.sc::selector)
|
63
|
+
new_fragment.graph << selector
|
64
|
+
new_fragment.sc::selector = selector
|
65
|
+
|
66
|
+
# sc:identifier
|
67
|
+
if fragment1.sc::identifier.first
|
68
|
+
selector = generalize_selectors(fragment1.sc::identifier + fragment2.sc::identifier)
|
69
|
+
new_fragment.graph << selector
|
70
|
+
new_fragment.sc::identifier = selector
|
71
|
+
end
|
72
|
+
|
73
|
+
# sc:subfragment
|
74
|
+
all_subfragments = fragment1.sc::subfragment + fragment2.sc::subfragment
|
75
|
+
all_subfragments.map { |sf| [sf.sc::type.sort_by(&:to_s), sf.sc::relation.sort_by(&:to_s)] }.uniq.each do |types, relations|
|
76
|
+
subfragments = all_subfragments.select do |sf|
|
77
|
+
sf.sc::type.sort_by(&:to_s) == types and
|
78
|
+
sf.sc::relation.sort_by(&:to_s) == relations
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
# Check new output
|
83
|
+
separate_output1 = fragment1.extract_graph :doc=>doc
|
84
|
+
separate_output2 = fragment2.extract_graph :doc=>doc
|
85
|
+
separate_output = separate_output1.merge separate_output2
|
86
|
+
new_output = new_fragment.proxy.extract_graph :doc=>doc
|
87
|
+
|
88
|
+
# Check if the output with the new fragment is a subset of the full output
|
89
|
+
# and if the output of the fragments alone is a subset of the output of the new
|
90
|
+
# fragment. This way we ensure the output is the same without using all the
|
91
|
+
# fragments that are available in the knowledge base.
|
92
|
+
new_fragment.proxy # if output.contains?(new_output) and new_output.contains?(separate_output)
|
93
|
+
end
|
94
|
+
|
95
|
+
def generalize_selectors selectors
|
96
|
+
selector = Node(nil)
|
97
|
+
selector.rdf::type = Node('sc:VisualSelector')
|
98
|
+
selector.sc::min_relative_x = selectors.map { |s| s.sc::min_relative_x.map(&:to_i) }.flatten.min.to_s
|
99
|
+
selector.sc::max_relative_x = selectors.map { |s| s.sc::max_relative_x.map(&:to_i) }.flatten.max.to_s
|
100
|
+
selector.sc::min_relative_y = selectors.map { |s| s.sc::min_relative_y.map(&:to_i) }.flatten.min.to_s
|
101
|
+
selector.sc::max_relative_y = selectors.map { |s| s.sc::max_relative_y.map(&:to_i) }.flatten.max.to_s
|
102
|
+
selector.sc::min_x = selectors.map { |s| s.sc::min_x.map(&:to_i) }.flatten.min.to_s
|
103
|
+
selector.sc::max_x = selectors.map { |s| s.sc::max_x.map(&:to_i) }.flatten.max.to_s
|
104
|
+
selector.sc::min_y = selectors.map { |s| s.sc::min_y.map(&:to_i) }.flatten.min.to_s
|
105
|
+
selector.sc::max_y = selectors.map { |s| s.sc::max_y.map(&:to_i) }.flatten.max.to_s
|
106
|
+
selector.sc::min_width = selectors.map { |s| s.sc::min_width.map(&:to_i) }.flatten.min.to_s
|
107
|
+
selector.sc::max_width = selectors.map { |s| s.sc::max_width.map(&:to_i) }.flatten.max.to_s
|
108
|
+
selector.sc::min_height = selectors.map { |s| s.sc::min_height.map(&:to_i) }.flatten.min.to_s
|
109
|
+
selector.sc::max_height = selectors.map { |s| s.sc::max_height.map(&:to_i) }.flatten.max.to_s
|
110
|
+
selector.sc::min_font_size = selectors.map { |s| s.sc::min_font_size.map(&:to_i) }.flatten.min.to_s if selectors.all? { |s| s.sc::min_font_size.first }
|
111
|
+
selector.sc::max_font_size = selectors.map { |s| s.sc::max_font_size.map(&:to_i) }.flatten.max.to_s if selectors.all? { |s| s.sc::max_font_size.first }
|
112
|
+
selector.sc::min_font_weight = selectors.map { |s| s.sc::min_font_weight.map(&:to_i) }.flatten.min.to_s if selectors.all? { |s| s.sc::min_font_weight.first }
|
113
|
+
selector.sc::max_font_weight = selectors.map { |s| s.sc::max_font_weight.map(&:to_i) }.flatten.max.to_s if selectors.all? { |s| s.sc::max_font_weight.first }
|
114
|
+
selector.sc::font_family = selectors.first.sc::font_family if selectors.map { |s| s.sc::font_family }.flatten.uniq.size == 1
|
115
|
+
selector.sc::tag = selectors.first.sc::tag if selectors.map { |s| s.sc::tag }.flatten.uniq.size == 1
|
116
|
+
selector.sc::attribute = selectors.first.sc::attribute if selectors.map { |s| s.sc::attribute }.flatten.uniq.size == 1
|
117
|
+
|
118
|
+
selector
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
@@ -6,10 +6,6 @@ module Scrappy
|
|
6
6
|
triples + train_sample(sample).triples
|
7
7
|
end )
|
8
8
|
end
|
9
|
-
|
10
|
-
# Optimizes the knowledge base by generalizing patterns
|
11
|
-
def optimize
|
12
|
-
end
|
13
9
|
|
14
10
|
private
|
15
11
|
def train_sample sample
|
@@ -34,37 +30,31 @@ module Scrappy
|
|
34
30
|
fragment.graph << selector
|
35
31
|
fragment.sc::selector = selector
|
36
32
|
when ID("sc:uri") then
|
37
|
-
# Assumption: URIs are extracted from a link
|
38
33
|
selector = selector_for(node.sc::uri.first.sc::source.first, node)
|
39
|
-
selector.sc::tag = "a"
|
40
|
-
selector.sc::attribute = "href"
|
41
|
-
|
42
34
|
fragment.graph << selector
|
43
35
|
fragment.sc::identifier = selector
|
44
36
|
when ID("rdf:type") then
|
45
37
|
fragment.sc::type = node.rdf::type
|
46
38
|
else
|
47
|
-
if node[predicate].map(&:class).uniq.first
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
graph = RDF::Graph.new( subfragments.inject([]) do |triples, subfragment|
|
52
|
-
triples + subfragment.graph.triples.map { |s,p,o| [s==subfragment.id ? id : s,p,o] }
|
53
|
-
end )
|
54
|
-
subfragment = graph[id]
|
55
|
-
subfragment.sc::relation = Node(predicate)
|
56
|
-
subfragment.sc::min_cardinality = "1"
|
39
|
+
if node[predicate].map(&:class).uniq.first == RDF::Node
|
40
|
+
node[predicate].map do |subnode|
|
41
|
+
subfragment = fragment_for(subnode, node)
|
42
|
+
subfragment.sc::relation = Node(predicate)
|
57
43
|
|
58
|
-
|
59
|
-
|
44
|
+
fragment.graph << subfragment
|
45
|
+
fragment.sc::subfragment += [subfragment]
|
46
|
+
end
|
60
47
|
end
|
61
48
|
end
|
62
49
|
end
|
63
|
-
fragment.rdf::type = Node("sc:Fragment")
|
50
|
+
fragment.rdf::type = Node("sc:Fragment")
|
51
|
+
fragment.sc::min_cardinality = "1"
|
52
|
+
fragment.sc::max_cardinality = "1"
|
64
53
|
fragment
|
65
54
|
end
|
66
55
|
|
67
56
|
def selector_for fragment, parent=nil
|
57
|
+
fragment_selector = fragment.sc::selector.first
|
68
58
|
presentation = fragment.sc::presentation.first
|
69
59
|
|
70
60
|
selector = Node(nil)
|
@@ -94,6 +84,9 @@ module Scrappy
|
|
94
84
|
selector.sc::min_font_weight = presentation.sc::font_weight
|
95
85
|
selector.sc::max_font_weight = presentation.sc::font_weight
|
96
86
|
selector.sc::font_family = presentation.sc::font_family
|
87
|
+
|
88
|
+
selector.sc::tag = fragment_selector.sc::tag.select { |tag| ["a","img"].include?(tag) }
|
89
|
+
selector.sc::attribute = fragment_selector.sc::attribute
|
97
90
|
|
98
91
|
selector
|
99
92
|
end
|
data/lib/scrappy/server/admin.rb
CHANGED
@@ -62,7 +62,7 @@ module Scrappy
|
|
62
62
|
map { |node| node.sc::type }.flatten.map(&:to_s).sort
|
63
63
|
haml :patterns
|
64
64
|
end
|
65
|
-
|
65
|
+
|
66
66
|
app.delete '/patterns/*' do |uri|
|
67
67
|
Scrappy::App.delete_pattern uri
|
68
68
|
flash[:notice] = "Pattern deleted"
|
@@ -94,6 +94,12 @@ module Scrappy
|
|
94
94
|
redirect "#{settings.base_uri}/samples"
|
95
95
|
end
|
96
96
|
|
97
|
+
app.post '/samples/:id/optimize' do |id|
|
98
|
+
Scrappy::App.save_patterns agent.optimize(Scrappy::Kb.patterns, Scrappy::App.samples[id.to_i])
|
99
|
+
flash[:notice] = "Optimization completed"
|
100
|
+
redirect "#{settings.base_uri}/samples"
|
101
|
+
end
|
102
|
+
|
97
103
|
app.post '/samples' do
|
98
104
|
html = Iconv.iconv('UTF-8', params[:encoding], params[:html]).first
|
99
105
|
sample = Scrappy::App.add_sample(:html=>html, :uri=>params[:uri], :date=>Time.now)
|
data/lib/scrappy/support.rb
CHANGED
@@ -29,4 +29,28 @@ class String
|
|
29
29
|
tr("-", "_").
|
30
30
|
downcase
|
31
31
|
end
|
32
|
+
end
|
33
|
+
|
34
|
+
class Array
|
35
|
+
# Return true if a given array has the same elements as this one
|
36
|
+
def equivalent? array
|
37
|
+
self.all? { |i| array.include?(i) } and
|
38
|
+
array.all? { |i| self.include?(i) }
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
module RDF
|
43
|
+
class Node
|
44
|
+
def self.mix *nodes
|
45
|
+
id = nodes.first
|
46
|
+
graph = RDF::Graph.new( nodes.inject([]) do |triples, node|
|
47
|
+
triples + node.graph.triples.map do |s,p,o|
|
48
|
+
[ s==node.id ? id : s,
|
49
|
+
p==node.id ? id : p,
|
50
|
+
o==node.id ? id : o ]
|
51
|
+
end
|
52
|
+
end )
|
53
|
+
graph[id]
|
54
|
+
end
|
55
|
+
end
|
32
56
|
end
|
data/lib/scrappy.rb
CHANGED
@@ -15,7 +15,8 @@ require 'scrappy/support'
|
|
15
15
|
require 'scrappy/repository'
|
16
16
|
|
17
17
|
require 'scrappy/extractor/extractor'
|
18
|
-
require 'scrappy/trainer/trainer'
|
18
|
+
require 'scrappy/learning/trainer'
|
19
|
+
require 'scrappy/learning/optimizer'
|
19
20
|
require 'scrappy/agent/map_reduce'
|
20
21
|
require 'scrappy/agent/cache'
|
21
22
|
require 'scrappy/agent/dumper'
|
@@ -23,5 +24,5 @@ require 'scrappy/agent/blind_agent'
|
|
23
24
|
require 'scrappy/agent/agent'
|
24
25
|
|
25
26
|
module Scrappy
|
26
|
-
VERSION = '0.3.2'
|
27
|
+
VERSION = '0.3.3'
|
27
28
|
end
|
@@ -16,16 +16,28 @@ var add_visual_data = function() {
|
|
16
16
|
item.setAttribute('vy', y);
|
17
17
|
item.setAttribute('vw', item.offsetWidth);
|
18
18
|
item.setAttribute('vh', item.offsetHeight);
|
19
|
-
|
20
|
-
|
21
|
-
item.
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
19
|
+
|
20
|
+
var item_with_text = false;
|
21
|
+
for (var k=0; k<item.childNodes.length; k++) {
|
22
|
+
child = item.childNodes[k]
|
23
|
+
if (child.nodeName == "#text" && child.textContent.trim() != "") {
|
24
|
+
item_with_text = true;
|
25
|
+
break;
|
26
|
+
}
|
27
|
+
}
|
28
|
+
|
29
|
+
if (item_with_text) {
|
30
|
+
var size = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-size');
|
31
|
+
size = size.substring(0, size.length-2);
|
32
|
+
item.setAttribute('vsize', size);
|
33
|
+
var fonts = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-family').split(",");
|
34
|
+
var font = fonts[fonts.length-1].trim();
|
35
|
+
item.setAttribute('vfont', font);
|
36
|
+
var weight = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-weight');
|
37
|
+
if (weight == 'normal') weight = 400;
|
38
|
+
if (weight == 'bold') weight = 700;
|
39
|
+
item.setAttribute('vweight', weight);
|
40
|
+
}
|
29
41
|
}
|
30
42
|
}
|
31
43
|
|
data/scrappy.gemspec
CHANGED
@@ -2,30 +2,30 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.3.2"
|
5
|
+
s.version = "0.3.3"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-03-
|
9
|
+
s.date = %q{2011-03-25}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
13
13
|
s.executables = ["scrappy"]
|
14
|
-
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/extractor/extractor.rb", "lib/scrappy/extractor/formats.rb", "lib/scrappy/extractor/fragment.rb", "lib/scrappy/extractor/selector.rb", "lib/scrappy/extractor/selectors/base_uri.rb", "lib/scrappy/extractor/selectors/css.rb", "lib/scrappy/extractor/selectors/new_uri.rb", "lib/scrappy/extractor/selectors/root.rb", "lib/scrappy/extractor/selectors/section.rb", "lib/scrappy/extractor/selectors/slice.rb", "lib/scrappy/extractor/selectors/uri.rb", "lib/scrappy/extractor/selectors/uri_pattern.rb", "lib/scrappy/extractor/selectors/visual.rb", "lib/scrappy/extractor/selectors/xpath.rb", "lib/scrappy/
|
15
|
-
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "
|
14
|
+
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "extractors/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/extractor/extractor.rb", "lib/scrappy/extractor/formats.rb", "lib/scrappy/extractor/fragment.rb", "lib/scrappy/extractor/selector.rb", "lib/scrappy/extractor/selectors/base_uri.rb", "lib/scrappy/extractor/selectors/css.rb", "lib/scrappy/extractor/selectors/new_uri.rb", "lib/scrappy/extractor/selectors/root.rb", "lib/scrappy/extractor/selectors/section.rb", "lib/scrappy/extractor/selectors/slice.rb", "lib/scrappy/extractor/selectors/uri.rb", "lib/scrappy/extractor/selectors/uri_pattern.rb", "lib/scrappy/extractor/selectors/visual.rb", "lib/scrappy/extractor/selectors/xpath.rb", "lib/scrappy/learning/optimizer.rb", "lib/scrappy/learning/trainer.rb", "lib/scrappy/repository.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb"]
|
15
|
+
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "extractors/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/extractor/extractor.rb", "lib/scrappy/extractor/formats.rb", "lib/scrappy/extractor/fragment.rb", "lib/scrappy/extractor/selector.rb", "lib/scrappy/extractor/selectors/base_uri.rb", "lib/scrappy/extractor/selectors/css.rb", "lib/scrappy/extractor/selectors/new_uri.rb", "lib/scrappy/extractor/selectors/root.rb", "lib/scrappy/extractor/selectors/section.rb", "lib/scrappy/extractor/selectors/slice.rb", "lib/scrappy/extractor/selectors/uri.rb", "lib/scrappy/extractor/selectors/uri_pattern.rb", "lib/scrappy/extractor/selectors/visual.rb", "lib/scrappy/extractor/selectors/xpath.rb", "lib/scrappy/learning/optimizer.rb", "lib/scrappy/learning/trainer.rb", "lib/scrappy/repository.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb", "public/favicon.ico", "public/images/logo.png", "public/images/logo_tiny.png", "public/javascripts/annotator.js", "public/javascripts/remote.js", "public/stylesheets/application.css", "test/test_helper.rb", "test/test_scrappy.rb", "views/extractors.haml", "views/help.haml", "views/home.haml", "views/layout.haml", "views/patterns.haml", "views/samples.haml", "scrappy.gemspec"]
|
16
16
|
s.homepage = %q{http://github.com/josei/scrappy}
|
17
17
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
|
18
18
|
s.require_paths = ["lib"]
|
19
19
|
s.rubyforge_project = %q{scrappy}
|
20
|
-
s.rubygems_version = %q{1.3.
|
20
|
+
s.rubygems_version = %q{1.3.6}
|
21
21
|
s.summary = %q{Web scraper that allows producing RDF data out of plain web pages}
|
22
|
-
s.test_files = ["test/test_helper.rb", "test/test_scrappy.rb"]
|
22
|
+
s.test_files = ["test/test_scrappy.rb", "test/test_helper.rb"]
|
23
23
|
|
24
24
|
if s.respond_to? :specification_version then
|
25
25
|
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
26
26
|
s.specification_version = 3
|
27
27
|
|
28
|
-
if Gem::Version.new(Gem::
|
28
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
29
29
|
s.add_runtime_dependency(%q<activesupport>, [">= 2.3.5"])
|
30
30
|
s.add_runtime_dependency(%q<sinatra>, [">= 1.1.2"])
|
31
31
|
s.add_runtime_dependency(%q<thin>, [">= 1.2.7"])
|
data/views/samples.haml
CHANGED
@@ -21,6 +21,8 @@
|
|
21
21
|
-[['Patterns output', :patterns], ['Extractors output', :extractors]].reverse.each do |text, action|
|
22
22
|
%span.format
|
23
23
|
%a{:href=>"#{settings.base_uri}/samples/#{i}/#{action}"}=text
|
24
|
+
%span.format
|
25
|
+
%a{:href=>"#{settings.base_uri}/samples/#{i}/optimize", :'data-method'=>:post} Optimize
|
24
26
|
%span.format
|
25
27
|
%a{:href=>"#{settings.base_uri}/samples/#{i}/train", :'data-method'=>:post} Train
|
26
28
|
%span.date
|
metadata
CHANGED
@@ -1,13 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrappy
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash: 23
|
5
4
|
prerelease: false
|
6
5
|
segments:
|
7
6
|
- 0
|
8
7
|
- 3
|
9
|
-
- 2
|
10
|
-
version: 0.3.2
|
8
|
+
- 3
|
9
|
+
version: 0.3.3
|
11
10
|
platform: ruby
|
12
11
|
authors:
|
13
12
|
- Jose Ignacio
|
@@ -15,18 +14,16 @@ autorequire:
|
|
15
14
|
bindir: bin
|
16
15
|
cert_chain: []
|
17
16
|
|
18
|
-
date: 2011-03-
|
17
|
+
date: 2011-03-25 00:00:00 +01:00
|
19
18
|
default_executable:
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
22
21
|
name: activesupport
|
23
22
|
prerelease: false
|
24
23
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
24
|
requirements:
|
27
25
|
- - ">="
|
28
26
|
- !ruby/object:Gem::Version
|
29
|
-
hash: 9
|
30
27
|
segments:
|
31
28
|
- 2
|
32
29
|
- 3
|
@@ -38,11 +35,9 @@ dependencies:
|
|
38
35
|
name: sinatra
|
39
36
|
prerelease: false
|
40
37
|
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
38
|
requirements:
|
43
39
|
- - ">="
|
44
40
|
- !ruby/object:Gem::Version
|
45
|
-
hash: 23
|
46
41
|
segments:
|
47
42
|
- 1
|
48
43
|
- 1
|
@@ -54,11 +49,9 @@ dependencies:
|
|
54
49
|
name: thin
|
55
50
|
prerelease: false
|
56
51
|
requirement: &id003 !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
52
|
requirements:
|
59
53
|
- - ">="
|
60
54
|
- !ruby/object:Gem::Version
|
61
|
-
hash: 17
|
62
55
|
segments:
|
63
56
|
- 1
|
64
57
|
- 2
|
@@ -70,11 +63,9 @@ dependencies:
|
|
70
63
|
name: nokogiri
|
71
64
|
prerelease: false
|
72
65
|
requirement: &id004 !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
66
|
requirements:
|
75
67
|
- - ">="
|
76
68
|
- !ruby/object:Gem::Version
|
77
|
-
hash: 5
|
78
69
|
segments:
|
79
70
|
- 1
|
80
71
|
- 4
|
@@ -86,11 +77,9 @@ dependencies:
|
|
86
77
|
name: mechanize
|
87
78
|
prerelease: false
|
88
79
|
requirement: &id005 !ruby/object:Gem::Requirement
|
89
|
-
none: false
|
90
80
|
requirements:
|
91
81
|
- - ">="
|
92
82
|
- !ruby/object:Gem::Version
|
93
|
-
hash: 23
|
94
83
|
segments:
|
95
84
|
- 1
|
96
85
|
- 0
|
@@ -102,11 +91,9 @@ dependencies:
|
|
102
91
|
name: lightrdf
|
103
92
|
prerelease: false
|
104
93
|
requirement: &id006 !ruby/object:Gem::Requirement
|
105
|
-
none: false
|
106
94
|
requirements:
|
107
95
|
- - ">="
|
108
96
|
- !ruby/object:Gem::Version
|
109
|
-
hash: 19
|
110
97
|
segments:
|
111
98
|
- 0
|
112
99
|
- 3
|
@@ -118,11 +105,9 @@ dependencies:
|
|
118
105
|
name: i18n
|
119
106
|
prerelease: false
|
120
107
|
requirement: &id007 !ruby/object:Gem::Requirement
|
121
|
-
none: false
|
122
108
|
requirements:
|
123
109
|
- - ">="
|
124
110
|
- !ruby/object:Gem::Version
|
125
|
-
hash: 11
|
126
111
|
segments:
|
127
112
|
- 0
|
128
113
|
- 4
|
@@ -134,11 +119,9 @@ dependencies:
|
|
134
119
|
name: rest-client
|
135
120
|
prerelease: false
|
136
121
|
requirement: &id008 !ruby/object:Gem::Requirement
|
137
|
-
none: false
|
138
122
|
requirements:
|
139
123
|
- - ">="
|
140
124
|
- !ruby/object:Gem::Version
|
141
|
-
hash: 13
|
142
125
|
segments:
|
143
126
|
- 1
|
144
127
|
- 6
|
@@ -150,11 +133,9 @@ dependencies:
|
|
150
133
|
name: haml
|
151
134
|
prerelease: false
|
152
135
|
requirement: &id009 !ruby/object:Gem::Requirement
|
153
|
-
none: false
|
154
136
|
requirements:
|
155
137
|
- - ">="
|
156
138
|
- !ruby/object:Gem::Version
|
157
|
-
hash: 55
|
158
139
|
segments:
|
159
140
|
- 3
|
160
141
|
- 0
|
@@ -166,11 +147,9 @@ dependencies:
|
|
166
147
|
name: rack-flash
|
167
148
|
prerelease: false
|
168
149
|
requirement: &id010 !ruby/object:Gem::Requirement
|
169
|
-
none: false
|
170
150
|
requirements:
|
171
151
|
- - ">="
|
172
152
|
- !ruby/object:Gem::Version
|
173
|
-
hash: 25
|
174
153
|
segments:
|
175
154
|
- 0
|
176
155
|
- 1
|
@@ -187,6 +166,7 @@ extensions: []
|
|
187
166
|
extra_rdoc_files:
|
188
167
|
- README.rdoc
|
189
168
|
- bin/scrappy
|
169
|
+
- extractors/elmundo.yarf
|
190
170
|
- lib/scrappy.rb
|
191
171
|
- lib/scrappy/agent/agent.rb
|
192
172
|
- lib/scrappy/agent/blind_agent.rb
|
@@ -207,20 +187,21 @@ extra_rdoc_files:
|
|
207
187
|
- lib/scrappy/extractor/selectors/uri_pattern.rb
|
208
188
|
- lib/scrappy/extractor/selectors/visual.rb
|
209
189
|
- lib/scrappy/extractor/selectors/xpath.rb
|
190
|
+
- lib/scrappy/learning/optimizer.rb
|
191
|
+
- lib/scrappy/learning/trainer.rb
|
210
192
|
- lib/scrappy/repository.rb
|
211
193
|
- lib/scrappy/server/admin.rb
|
212
194
|
- lib/scrappy/server/errors.rb
|
213
195
|
- lib/scrappy/server/helpers.rb
|
214
196
|
- lib/scrappy/server/server.rb
|
215
197
|
- lib/scrappy/support.rb
|
216
|
-
- lib/scrappy/trainer/trainer.rb
|
217
198
|
files:
|
218
199
|
- History.txt
|
219
200
|
- Manifest
|
220
201
|
- README.rdoc
|
221
202
|
- Rakefile
|
222
203
|
- bin/scrappy
|
223
|
-
- kb/elmundo.yarf
|
204
|
+
- extractors/elmundo.yarf
|
224
205
|
- lib/scrappy.rb
|
225
206
|
- lib/scrappy/agent/agent.rb
|
226
207
|
- lib/scrappy/agent/blind_agent.rb
|
@@ -241,13 +222,14 @@ files:
|
|
241
222
|
- lib/scrappy/extractor/selectors/uri_pattern.rb
|
242
223
|
- lib/scrappy/extractor/selectors/visual.rb
|
243
224
|
- lib/scrappy/extractor/selectors/xpath.rb
|
225
|
+
- lib/scrappy/learning/optimizer.rb
|
226
|
+
- lib/scrappy/learning/trainer.rb
|
244
227
|
- lib/scrappy/repository.rb
|
245
228
|
- lib/scrappy/server/admin.rb
|
246
229
|
- lib/scrappy/server/errors.rb
|
247
230
|
- lib/scrappy/server/helpers.rb
|
248
231
|
- lib/scrappy/server/server.rb
|
249
232
|
- lib/scrappy/support.rb
|
250
|
-
- lib/scrappy/trainer/trainer.rb
|
251
233
|
- public/favicon.ico
|
252
234
|
- public/images/logo.png
|
253
235
|
- public/images/logo_tiny.png
|
@@ -278,20 +260,16 @@ rdoc_options:
|
|
278
260
|
require_paths:
|
279
261
|
- lib
|
280
262
|
required_ruby_version: !ruby/object:Gem::Requirement
|
281
|
-
none: false
|
282
263
|
requirements:
|
283
264
|
- - ">="
|
284
265
|
- !ruby/object:Gem::Version
|
285
|
-
hash: 3
|
286
266
|
segments:
|
287
267
|
- 0
|
288
268
|
version: "0"
|
289
269
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
290
|
-
none: false
|
291
270
|
requirements:
|
292
271
|
- - ">="
|
293
272
|
- !ruby/object:Gem::Version
|
294
|
-
hash: 11
|
295
273
|
segments:
|
296
274
|
- 1
|
297
275
|
- 2
|
@@ -299,10 +277,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
299
277
|
requirements: []
|
300
278
|
|
301
279
|
rubyforge_project: scrappy
|
302
|
-
rubygems_version: 1.3.
|
280
|
+
rubygems_version: 1.3.6
|
303
281
|
signing_key:
|
304
282
|
specification_version: 3
|
305
283
|
summary: Web scraper that allows producing RDF data out of plain web pages
|
306
284
|
test_files:
|
307
|
-
- test/test_helper.rb
|
308
285
|
- test/test_scrappy.rb
|
286
|
+
- test/test_helper.rb
|