scrappy 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +7 -0
- data/Manifest +3 -2
- data/bin/scrappy +6 -5
- data/{kb → extractors}/elmundo.yarf +0 -0
- data/lib/scrappy/agent/agent.rb +1 -0
- data/lib/scrappy/extractor/extractor.rb +14 -64
- data/lib/scrappy/extractor/fragment.rb +131 -70
- data/lib/scrappy/extractor/selectors/new_uri.rb +15 -12
- data/lib/scrappy/extractor/selectors/root.rb +1 -1
- data/lib/scrappy/extractor/selectors/slice.rb +1 -1
- data/lib/scrappy/extractor/selectors/visual.rb +41 -20
- data/lib/scrappy/extractor/selectors/xpath.rb +1 -1
- data/lib/scrappy/learning/optimizer.rb +121 -0
- data/lib/scrappy/{trainer → learning}/trainer.rb +14 -21
- data/lib/scrappy/server/admin.rb +7 -1
- data/lib/scrappy/support.rb +24 -0
- data/lib/scrappy.rb +3 -2
- data/public/javascripts/annotator.js +22 -10
- data/public/stylesheets/application.css +1 -1
- data/scrappy.gemspec +7 -7
- data/views/samples.haml +2 -0
- metadata +11 -33
data/History.txt
CHANGED
data/Manifest
CHANGED
@@ -3,7 +3,7 @@ Manifest
|
|
3
3
|
README.rdoc
|
4
4
|
Rakefile
|
5
5
|
bin/scrappy
|
6
|
-
|
6
|
+
extractors/elmundo.yarf
|
7
7
|
lib/scrappy.rb
|
8
8
|
lib/scrappy/agent/agent.rb
|
9
9
|
lib/scrappy/agent/blind_agent.rb
|
@@ -24,13 +24,14 @@ lib/scrappy/extractor/selectors/uri.rb
|
|
24
24
|
lib/scrappy/extractor/selectors/uri_pattern.rb
|
25
25
|
lib/scrappy/extractor/selectors/visual.rb
|
26
26
|
lib/scrappy/extractor/selectors/xpath.rb
|
27
|
+
lib/scrappy/learning/optimizer.rb
|
28
|
+
lib/scrappy/learning/trainer.rb
|
27
29
|
lib/scrappy/repository.rb
|
28
30
|
lib/scrappy/server/admin.rb
|
29
31
|
lib/scrappy/server/errors.rb
|
30
32
|
lib/scrappy/server/helpers.rb
|
31
33
|
lib/scrappy/server/server.rb
|
32
34
|
lib/scrappy/support.rb
|
33
|
-
lib/scrappy/trainer/trainer.rb
|
34
35
|
public/favicon.ico
|
35
36
|
public/images/logo.png
|
36
37
|
public/images/logo_tiny.png
|
data/bin/scrappy
CHANGED
@@ -40,8 +40,7 @@ module Scrappy
|
|
40
40
|
opts.on('-c C', '--concurrence C') { |c| Agent::Options.workers = c.to_i }
|
41
41
|
opts.on('-d D', '--delay D') { |d| Agent::Options.delay = d; Agent::Options.workers = 1 }
|
42
42
|
opts.on('-l L', '--levels L') { |l| Agent::Options.depth = l.to_i }
|
43
|
-
opts.on('-r', '--reference') { Agent::Options.referenceable =
|
44
|
-
opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
|
43
|
+
opts.on('-r', '--reference') { Agent::Options.referenceable = true }
|
45
44
|
opts.on('-f FORMAT', '--format FORMAT') { |f| Agent::Options.format = f.to_sym }
|
46
45
|
opts.on('-t TIME', '--time TIME') { |t| Agent::Options.time = t.to_i*60 } # converts minutes to seconds
|
47
46
|
opts.on('-o URIs', '--observe URIs') { |uris| Options.observe = uris.split(',') }
|
@@ -106,9 +105,12 @@ module Scrappy
|
|
106
105
|
end
|
107
106
|
def self.add_pattern graph
|
108
107
|
new_patterns = Scrappy::Kb.patterns.merge graph
|
109
|
-
|
108
|
+
save_patterns new_patterns
|
110
109
|
onload
|
111
110
|
end
|
111
|
+
def self.save_patterns new_patterns
|
112
|
+
open(@patterns_file, "w") { |f| f.write new_patterns.serialize(:yarf) }
|
113
|
+
end
|
112
114
|
def self.delete_pattern uri
|
113
115
|
graph = Scrappy::Kb.patterns
|
114
116
|
fragments = graph.find(nil, Node('rdf:type'), Node('sc:Fragment')).
|
@@ -169,8 +171,7 @@ Options
|
|
169
171
|
-a, --admin [ROOT] Runs admin web server (optionally specify server's root url)
|
170
172
|
-P, --port PORT Selects port number (default is 3434)
|
171
173
|
-t, --time TIME Returns repository data from the last given minutes
|
172
|
-
-r, --reference Outputs
|
173
|
-
-R, --reference-all Outputs all HTML referenceable data
|
174
|
+
-r, --reference Outputs reference information
|
174
175
|
|
175
176
|
Authors
|
176
177
|
José Ignacio Fernández, Alberto Mardomingo, Jacobo Blasco
|
File without changes
|
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -28,9 +28,6 @@ module Scrappy
|
|
28
28
|
end
|
29
29
|
end
|
30
30
|
|
31
|
-
# Add references to sources if requested
|
32
|
-
triples += add_referenceable_data uri, content, triples, referenceable if referenceable
|
33
|
-
|
34
31
|
puts "done!" if self.options.debug
|
35
32
|
|
36
33
|
triples
|
@@ -38,71 +35,24 @@ module Scrappy
|
|
38
35
|
end
|
39
36
|
|
40
37
|
def fragments_for kb, uri
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
38
|
+
root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
|
39
|
+
|
40
|
+
selectors = []
|
41
|
+
fragments = {}
|
42
|
+
root_fragments.each do |fragment|
|
43
|
+
fragment.sc::selector.each do |selector|
|
44
|
+
fragments[selector] = fragment
|
45
|
+
selectors << selector
|
46
|
+
end
|
45
47
|
end
|
46
|
-
|
47
|
-
visual_selectors = kb.find(nil, Node('rdf:type'), Node('sc:VisualSelector'))
|
48
|
-
|
49
|
-
selectors = uri_selectors + visual_selectors
|
50
|
-
|
51
|
-
selectors.map { |selector| kb.find(nil, Node('sc:selector'), selector) }.
|
52
|
-
flatten.
|
53
|
-
select { |selector| selector.rdf::type.include?(Node('sc:Fragment')) }
|
54
|
-
end
|
55
|
-
|
56
|
-
private
|
57
|
-
def add_referenceable_data uri, content, given_triples, referenceable
|
58
|
-
triples = []
|
59
|
-
resources = {}; given_triples.each { |s,p,o| resources[s] = resources[o] = true }
|
60
48
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
selector.rdf::type = Node('sc:UnivocalSelector')
|
66
|
-
selector.sc::path = '/'
|
67
|
-
selector.sc::document = uri
|
49
|
+
uri_selectors = selectors.select { |selector| selector.rdf::type.include?(Node('sc:UriSelector')) or
|
50
|
+
selector.rdf::type.include?(Node('sc:UriPatternSelector')) }.
|
51
|
+
select { |selector| !kb.node(selector).filter(:uri=>uri).empty? }
|
68
52
|
|
69
|
-
|
70
|
-
|
71
|
-
triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples) if referenceable==:dump or resources[fragment.id]
|
72
|
-
|
73
|
-
content.search('*').each do |node|
|
74
|
-
next if node.text?
|
75
|
-
|
76
|
-
fragment = Extractor.node_hash(uri, node.path)
|
77
|
-
|
78
|
-
if referenceable == :dump or resources[fragment]
|
79
|
-
selector = ID(nil)
|
80
|
-
presentation = ID(nil)
|
81
|
-
|
82
|
-
triples << [selector, ID('rdf:type'), ID('sc:UnivocalSelector')]
|
83
|
-
triples << [selector, ID('sc:path'), node.path.to_s]
|
84
|
-
triples << [selector, ID('sc:tag'), node.name.to_s]
|
85
|
-
triples << [selector, ID('sc:document'), uri]
|
86
|
-
|
87
|
-
triples << [presentation, ID('sc:x'), node[:vx].to_s] if node[:vx]
|
88
|
-
triples << [presentation, ID('sc:y'), node[:vy].to_s] if node[:vy]
|
89
|
-
triples << [presentation, ID('sc:width'), node[:vw].to_s] if node[:vw]
|
90
|
-
triples << [presentation, ID('sc:height'), node[:vh].to_s] if node[:vh]
|
91
|
-
triples << [presentation, ID('sc:font_size'), node[:vsize].gsub("px","").to_s] if node[:vsize]
|
92
|
-
triples << [presentation, ID('sc:font_family'), node[:vfont]] if node[:vfont]
|
93
|
-
triples << [presentation, ID('sc:font_weight'), node[:vweight].to_s] if node[:vweight]
|
94
|
-
triples << [presentation, ID('sc:text'), node.text.strip]
|
95
|
-
|
96
|
-
triples << [fragment, ID('sc:selector'), selector]
|
97
|
-
triples << [fragment, ID('sc:presentation'), presentation]
|
98
|
-
end
|
99
|
-
end
|
100
|
-
triples
|
101
|
-
end
|
53
|
+
visual_selectors = selectors.select { |selector| selector.rdf::type.include?(Node('sc:VisualSelector')) }
|
102
54
|
|
103
|
-
|
104
|
-
digest = Digest::MD5.hexdigest("#{uri} #{path}")
|
105
|
-
:"_:bnode#{digest}"
|
55
|
+
(uri_selectors + visual_selectors).map { |selector| fragments[selector] }
|
106
56
|
end
|
107
57
|
end
|
108
58
|
end
|
@@ -2,101 +2,126 @@ module Sc
|
|
2
2
|
class Fragment
|
3
3
|
include RDF::NodeProxy
|
4
4
|
|
5
|
+
# Extracts data out of a document and returns an RDF::Graph
|
6
|
+
def extract_graph options={}
|
7
|
+
graph = RDF::Graph.new
|
8
|
+
extract(options).each { |node| graph << node }
|
9
|
+
graph
|
10
|
+
end
|
11
|
+
|
12
|
+
# Extracts data out of a document and returns an array of nodes
|
5
13
|
def extract options={}
|
6
|
-
|
14
|
+
# Extracts all the mappings and any subfragment
|
15
|
+
mappings(options).map do |result|
|
16
|
+
node = result[:node]
|
17
|
+
subfragments = result[:subfragments]
|
18
|
+
doc = result[:doc]
|
19
|
+
|
20
|
+
# Process subfragments
|
21
|
+
consistent = true
|
22
|
+
subfragments.each do |subfragment|
|
23
|
+
# Get subfragment object
|
24
|
+
subfragment = subfragment.proxy Node('sc:Fragment')
|
25
|
+
|
26
|
+
# Extract data from the subfragment
|
27
|
+
subnodes = subfragment.extract(options.merge(:doc=>doc))
|
28
|
+
|
29
|
+
# Add relations
|
30
|
+
subnodes.each do |subnode|
|
31
|
+
node.graph << subnode if subnode.is_a?(RDF::Node)
|
32
|
+
subfragment.sc::relation.each { |relation| node[relation] += [subnode] }
|
33
|
+
end
|
34
|
+
|
35
|
+
# Check consistency
|
36
|
+
consistent = false if subfragment.sc::min_cardinality.first and subnodes.size < subfragment.sc::min_cardinality.first.to_i
|
37
|
+
consistent = false if subfragment.sc::max_cardinality.first and subnodes.size > subfragment.sc::max_cardinality.first.to_i
|
38
|
+
end
|
39
|
+
|
40
|
+
# Skip the node if it has inconsistent relations
|
41
|
+
# For example: extracting a sioc:Post with no dc:title would
|
42
|
+
# violate the constraint sc:min_cardinality = 1
|
43
|
+
next if !consistent
|
44
|
+
|
45
|
+
node
|
46
|
+
end.compact
|
47
|
+
end
|
7
48
|
|
49
|
+
# Returns all the mappings between this fragment and RDF nodes
|
50
|
+
def mappings options
|
8
51
|
# Identify the fragment's mappings
|
9
52
|
docs = sc::selector.map { |s| graph.node(s).select options[:doc] }.flatten
|
10
53
|
|
11
|
-
# Generate
|
54
|
+
# Generate a result for each page mapping
|
12
55
|
docs.map do |doc|
|
13
56
|
# Build RDF nodes from identifier selectors (if present)
|
14
|
-
|
57
|
+
node = build_node(doc, options[:referenceable])
|
15
58
|
|
16
|
-
#
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
value = doc[:value].to_s.strip
|
21
|
-
if options[:referenceable]
|
22
|
-
node.rdf::value = value
|
23
|
-
node.rdf::type = Node('rdf:Literal')
|
24
|
-
node
|
25
|
-
else
|
26
|
-
value
|
27
|
-
end
|
28
|
-
else
|
29
|
-
# Add statements about the node
|
30
|
-
sc::type.each { |type| node.rdf::type += [type] if type != Node('rdf:Resource') }
|
31
|
-
sc::superclass.each { |superclass| node.rdfs::subClassOf += [superclass] }
|
32
|
-
sc::sameas.each { |samenode| node.owl::sameAs += [samenode] }
|
59
|
+
# Skip the node if no URI or bnode is created
|
60
|
+
next if !node
|
61
|
+
|
62
|
+
# Add info to the node
|
33
63
|
|
64
|
+
# Build the object -- it can be a node or a literal
|
65
|
+
object = if sc::type.include?(Node('rdf:Literal'))
|
66
|
+
value = doc[:value].to_s.strip
|
67
|
+
if options[:referenceable]
|
68
|
+
node.rdf::value = value
|
69
|
+
node.rdf::type = Node('rdf:Literal')
|
34
70
|
node
|
71
|
+
else
|
72
|
+
value
|
35
73
|
end
|
74
|
+
else
|
75
|
+
# Add statements about the node
|
76
|
+
sc::type.each { |type| node.rdf::type += [type] if type != Node('rdf:Resource') }
|
77
|
+
sc::superclass.each { |superclass| node.rdfs::subClassOf += [superclass] }
|
78
|
+
sc::sameas.each { |samenode| node.owl::sameAs += [samenode] }
|
36
79
|
|
37
|
-
|
38
|
-
|
39
|
-
sc::subfragment.each do |subfragment|
|
40
|
-
# Get subfragment object
|
41
|
-
subfragment = graph.node(subfragment, Node('sc:Fragment'))
|
42
|
-
# Extract data from the subfragment
|
43
|
-
subnodes = subfragment.extract(options.merge(:doc=>doc))
|
44
|
-
|
45
|
-
# Add relations
|
46
|
-
subnodes.each do |subnode|
|
47
|
-
node.graph << subnode if subnode.is_a?(RDF::Node)
|
48
|
-
subfragment.sc::relation.each { |relation| node[relation] += [subnode] }
|
49
|
-
end
|
50
|
-
|
51
|
-
# Check consistency
|
52
|
-
consistent = false if subfragment.sc::min_cardinality.first and subnodes.size < subfragment.sc::min_cardinality.first.to_i
|
53
|
-
consistent = false if subfragment.sc::max_cardinality.first and subnodes.size > subfragment.sc::max_cardinality.first.to_i
|
54
|
-
end
|
80
|
+
node
|
81
|
+
end
|
55
82
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
sc::type.each { |type| source.sc::type += [type] }
|
66
|
-
sc::relation.each { |relation| source.sc::relation += [relation] }
|
67
|
-
node.graph << source
|
68
|
-
node.sc::source += [source]
|
69
|
-
end
|
70
|
-
end
|
71
|
-
|
72
|
-
# Object points to either the node or the literal
|
73
|
-
object
|
83
|
+
# Add referenceable data if requested
|
84
|
+
if options[:referenceable] and node.size > 0
|
85
|
+
source = reference(doc)
|
86
|
+
source.sc::type = sc::type
|
87
|
+
source.sc::superclass = sc::superclass
|
88
|
+
source.sc::sameas = sc::sameas
|
89
|
+
source.sc::relation = sc::relation
|
90
|
+
node.graph << source
|
91
|
+
node.sc::source = source
|
74
92
|
end
|
75
|
-
|
93
|
+
|
94
|
+
# Variable object points to either a node or a literal
|
95
|
+
# Return the object, as well as its subfragments (if any)
|
96
|
+
# and the doc it was extracted from
|
97
|
+
{ :node=>object, :subfragments=>sc::subfragment, :doc=>doc }
|
98
|
+
end.compact
|
76
99
|
end
|
77
100
|
|
78
|
-
|
79
|
-
|
80
|
-
|
101
|
+
private
|
102
|
+
# Builds a node given a document
|
103
|
+
def build_node doc, referenceable
|
104
|
+
return Node(nil) if sc::identifier.empty?
|
105
|
+
|
106
|
+
sc::identifier.map { |s| graph.node(s).select doc }.flatten.map do |d|
|
107
|
+
node = Node(parse_uri(d[:uri], d[:value]))
|
81
108
|
|
82
109
|
if referenceable
|
83
110
|
# Include the fragment where the URI was built from
|
84
|
-
uri_node
|
85
|
-
|
86
|
-
|
87
|
-
node.sc::uri = uri_node
|
111
|
+
uri_node = Node(nil)
|
112
|
+
source = reference(d)
|
113
|
+
uri_node.graph << source
|
88
114
|
uri_node.rdf::value = node.to_s
|
89
|
-
uri_node.sc::source =
|
115
|
+
uri_node.sc::source = source
|
116
|
+
|
117
|
+
node.graph << uri_node
|
118
|
+
node.sc::uri = uri_node
|
90
119
|
end
|
91
120
|
|
92
121
|
node
|
93
|
-
end
|
94
|
-
nodes << Node(nil) if nodes.empty?
|
95
|
-
|
96
|
-
nodes
|
122
|
+
end.first
|
97
123
|
end
|
98
124
|
|
99
|
-
private
|
100
125
|
# Parses a URI by resolving relative paths
|
101
126
|
def parse_uri(uri, rel_uri)
|
102
127
|
return ID('*') if rel_uri.nil?
|
@@ -107,5 +132,41 @@ module Sc
|
|
107
132
|
end
|
108
133
|
end
|
109
134
|
|
135
|
+
# Builds an RDF reference to an HTML node
|
136
|
+
def reference doc
|
137
|
+
node = doc[:content].is_a?(Nokogiri::XML::NodeSet) ? doc[:content].first.parent : doc[:content]
|
138
|
+
attribute = doc[:attribute]
|
139
|
+
uri = doc[:uri]
|
140
|
+
|
141
|
+
source = Node(nil)
|
142
|
+
selector = Node(nil)
|
143
|
+
presentation = Node(nil)
|
144
|
+
|
145
|
+
source.graph << selector
|
146
|
+
source.sc::selector = selector
|
147
|
+
|
148
|
+
selector.rdf::type = Node('sc:UnivocalSelector')
|
149
|
+
selector.sc::path = node.path
|
150
|
+
selector.sc::document = uri
|
151
|
+
selector.sc::attribute = attribute if attribute
|
152
|
+
|
153
|
+
if node.path != '/'
|
154
|
+
selector.sc::tag = node.name
|
155
|
+
source.graph << presentation
|
156
|
+
source.sc::presentation = presentation
|
157
|
+
end
|
158
|
+
|
159
|
+
presentation.sc::x = node[:vx] if node[:vx]
|
160
|
+
presentation.sc::y = node[:vy] if node[:vy]
|
161
|
+
presentation.sc::width = node[:vw] if node[:vw]
|
162
|
+
presentation.sc::height = node[:vh] if node[:vh]
|
163
|
+
presentation.sc::font_size = node[:vsize] if node[:vsize]
|
164
|
+
presentation.sc::font_family = node[:vfont] if node[:vfont]
|
165
|
+
presentation.sc::font_weight = node[:vweight] if node[:vweight]
|
166
|
+
presentation.sc::text = node.text.strip
|
167
|
+
|
168
|
+
source
|
169
|
+
end
|
170
|
+
|
110
171
|
end
|
111
172
|
end
|
@@ -3,30 +3,33 @@ module Sc
|
|
3
3
|
def filter doc
|
4
4
|
contents = if sc::attribute.first
|
5
5
|
# Select node's attribute if given
|
6
|
-
sc::attribute.map { |attribute| doc[:content][attribute] }
|
6
|
+
sc::attribute.map { |attribute| [doc[:content][attribute], attribute] }
|
7
7
|
else
|
8
|
-
[ doc[:value] ]
|
8
|
+
[ [doc[:value], nil] ]
|
9
9
|
end
|
10
10
|
|
11
11
|
@indexes ||= Hash.new(0)
|
12
12
|
prefix = sc::prefix.first.to_s
|
13
|
-
prefix = (prefix =~ /\Ahttp
|
13
|
+
prefix = (prefix =~ /\Ahttp\:/ or prefix =~ /\Ahttps\:/) ? URI::parse(doc[:uri]).merge(prefix).to_s : "#{doc[:uri]}#{prefix}"
|
14
14
|
suffix = sc::suffix.first.to_s
|
15
15
|
|
16
|
-
contents.map do |content|
|
17
|
-
|
18
|
-
|
16
|
+
contents.map do |content, attribute|
|
17
|
+
new_uri = if (content.to_s =~ /\Ahttp\:/ or content.to_s =~ /\Ahttps\:/)
|
18
|
+
"#{content}#{suffix}"
|
19
19
|
else
|
20
|
-
if sc::
|
21
|
-
|
20
|
+
variable = if sc::sequence.first.to_s=="true"
|
21
|
+
@indexes[prefix] += 1
|
22
22
|
else
|
23
|
-
|
23
|
+
if sc::downcase.first.to_s=="true"
|
24
|
+
content.to_s.underscore
|
25
|
+
else
|
26
|
+
content.to_s.wikify
|
27
|
+
end
|
24
28
|
end
|
29
|
+
"#{prefix}#{variable}#{suffix}"
|
25
30
|
end
|
26
31
|
|
27
|
-
new_uri
|
28
|
-
|
29
|
-
{ :uri=>new_uri, :content=>doc[:content], :value=>new_uri }
|
32
|
+
{ :uri=>new_uri, :content=>doc[:content], :value=>new_uri, :attribute=>attribute }
|
30
33
|
end
|
31
34
|
end
|
32
35
|
end
|
@@ -3,7 +3,7 @@ module Sc
|
|
3
3
|
def filter doc
|
4
4
|
if sc::attribute.first
|
5
5
|
# Select node's attribute if given
|
6
|
-
sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute] } }
|
6
|
+
sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute], :attribute=>attribute } }
|
7
7
|
else
|
8
8
|
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:value], sc::format, doc[:uri]) } ]
|
9
9
|
end
|
@@ -5,7 +5,7 @@ module Sc
|
|
5
5
|
slices = doc[:value].split(separator)
|
6
6
|
sc::index.map { |index| slices[index.to_i].to_s.strip }.
|
7
7
|
select { |value| value != "" }.
|
8
|
-
map { |value| { :uri=>doc[:uri], :content=>doc[:content], :value=>value} }
|
8
|
+
map { |value| { :uri=>doc[:uri], :content=>doc[:content], :value=>value, :attribute=>doc[:attribute]} }
|
9
9
|
end.flatten
|
10
10
|
end
|
11
11
|
end
|
@@ -1,37 +1,58 @@
|
|
1
1
|
module Sc
|
2
2
|
class VisualSelector < Selector
|
3
3
|
def filter doc
|
4
|
+
# By initializing variables, we avoid getting data from a hash (slow)
|
5
|
+
min_relative_x = (sc::min_relative_x.first.to_i if sc::min_relative_x.first)
|
6
|
+
max_relative_x = (sc::max_relative_x.first.to_i if sc::max_relative_x.first)
|
7
|
+
min_relative_y = (sc::min_relative_y.first.to_i if sc::min_relative_y.first)
|
8
|
+
max_relative_y = (sc::max_relative_y.first.to_i if sc::max_relative_y.first)
|
9
|
+
min_x = (sc::min_x.first.to_i if sc::min_x.first)
|
10
|
+
max_x = (sc::max_x.first.to_i if sc::max_x.first)
|
11
|
+
min_y = (sc::min_y.first.to_i if sc::min_y.first)
|
12
|
+
max_y = (sc::max_y.first.to_i if sc::max_y.first)
|
13
|
+
min_width = (sc::min_width.first.to_i if sc::min_width.first)
|
14
|
+
max_width = (sc::max_width.first.to_i if sc::max_width.first)
|
15
|
+
min_height = (sc::min_height.first.to_i if sc::min_height.first)
|
16
|
+
max_height = (sc::max_height.first.to_i if sc::max_height.first)
|
17
|
+
min_font_size = (sc::min_font_size.first.to_i if sc::min_font_size.first)
|
18
|
+
max_font_size = (sc::max_font_size.first.to_i if sc::max_font_size.first)
|
19
|
+
min_font_weight = (sc::min_font_weight.first.to_i if sc::min_font_weight.first)
|
20
|
+
max_font_weight = (sc::max_font_weight.first.to_i if sc::max_font_weight.first)
|
21
|
+
font_family = sc::font_family.first
|
22
|
+
attributes = sc::attribute
|
23
|
+
formats = sc::format
|
24
|
+
|
4
25
|
doc[:content].search(sc::tag.first || "*").select do |node|
|
5
26
|
relative_x = node['vx'].to_i - doc[:content]['vx'].to_i
|
6
27
|
relative_y = node['vy'].to_i - doc[:content]['vy'].to_i
|
7
28
|
|
8
29
|
!node.text? and
|
9
|
-
( !
|
10
|
-
( !
|
11
|
-
( !
|
12
|
-
( !
|
30
|
+
( !min_relative_x or relative_x >= min_relative_x) and
|
31
|
+
( !max_relative_x or relative_x <= max_relative_x) and
|
32
|
+
( !min_relative_y or relative_y >= min_relative_y) and
|
33
|
+
( !max_relative_y or relative_y <= max_relative_y) and
|
13
34
|
|
14
|
-
( !
|
15
|
-
( !
|
16
|
-
( !
|
17
|
-
( !
|
35
|
+
( !min_x or node['vx'].to_i >= min_x) and
|
36
|
+
( !max_x or node['vx'].to_i <= max_x) and
|
37
|
+
( !min_y or node['vy'].to_i >= min_y) and
|
38
|
+
( !max_y or node['vy'].to_i <= max_y) and
|
18
39
|
|
19
|
-
( !
|
20
|
-
( !
|
21
|
-
( !
|
22
|
-
( !
|
40
|
+
( !min_width or node['vw'].to_i >= min_width) and
|
41
|
+
( !max_width or node['vw'].to_i <= max_width) and
|
42
|
+
( !min_height or node['vh'].to_i >= min_height) and
|
43
|
+
( !max_height or node['vh'].to_i <= max_height) and
|
23
44
|
|
24
|
-
( !
|
25
|
-
( !
|
26
|
-
( !
|
27
|
-
( !
|
28
|
-
( !
|
45
|
+
( !min_font_size or node['vsize'].to_i >= min_font_size) and
|
46
|
+
( !max_font_size or node['vsize'].to_i <= max_font_size) and
|
47
|
+
( !min_font_weight or node['vweight'].to_i >= min_font_weight) and
|
48
|
+
( !max_font_weight or node['vweight'].to_i <= max_font_weight) and
|
49
|
+
( !font_family or node['vfont'] == font_family)
|
29
50
|
end.map do |content|
|
30
|
-
if
|
51
|
+
if attributes.first
|
31
52
|
# Select node's attribute if given
|
32
|
-
|
53
|
+
attributes.map { |attribute| { :uri=>doc[:uri], :content=>content, :value=>content[attribute], :attribute=>attribute } }
|
33
54
|
else
|
34
|
-
[ { :uri=>doc[:uri], :content=>content, :value=>format(content,
|
55
|
+
[ { :uri=>doc[:uri], :content=>content, :value=>format(content, formats, doc[:uri]) } ]
|
35
56
|
end
|
36
57
|
end.flatten
|
37
58
|
end
|
@@ -11,7 +11,7 @@ module Sc
|
|
11
11
|
(doc[:content].search(pattern)[interval] || []).select { |node| patterns.any? ? patterns.include?(node.text.downcase.strip) : true }.map do |result|
|
12
12
|
if sc::attribute.first
|
13
13
|
# Select node's attribute if given
|
14
|
-
sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
|
14
|
+
sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute], :attribute=>attribute } }
|
15
15
|
else
|
16
16
|
# Select node
|
17
17
|
[ { :uri=>doc[:uri], :content=>result, :value=>format(result, sc::format, doc[:uri]) } ]
|
@@ -0,0 +1,121 @@
|
|
1
|
+
module Scrappy
|
2
|
+
module Optimizer
|
3
|
+
# Iterates through a knowledge base and tries to merge and generalize
|
4
|
+
# selectors whenever the output of the resulting kb is the same
|
5
|
+
def optimize kb, sample
|
6
|
+
# Get the output only once
|
7
|
+
output = RDF::Graph.new extract(sample[:uri], sample[:html], kb)
|
8
|
+
|
9
|
+
# Build an array of fragments
|
10
|
+
root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
|
11
|
+
fragments = []; root_fragments.each { |f| fragments << kb.node(Node(f.id, RDF::Graph.new(f.all_triples))) }
|
12
|
+
|
13
|
+
# Parse the document
|
14
|
+
doc = { :uri=>sample[:uri], :content=>Nokogiri::HTML(sample[:html], nil, 'utf-8') }
|
15
|
+
|
16
|
+
begin
|
17
|
+
changed, fragments = optimize_once fragments, doc, output
|
18
|
+
end until !changed
|
19
|
+
|
20
|
+
graph = RDF::Graph.new
|
21
|
+
fragments.each { |fragment| graph << fragment }
|
22
|
+
|
23
|
+
graph
|
24
|
+
end
|
25
|
+
|
26
|
+
protected
|
27
|
+
# Tries to optimize a set of fragments.
|
28
|
+
# Returns true if there were changes, false otherwise,
|
29
|
+
# and the new fragments as the second array element
|
30
|
+
def optimize_once fragments, doc, output
|
31
|
+
fragments.each do |fragment1|
|
32
|
+
fragments.each do |fragment2|
|
33
|
+
next if fragment1 == fragment2
|
34
|
+
new_fragment = mix_if_gain(fragment1, fragment2, doc, output)
|
35
|
+
|
36
|
+
# End if a new fragment was created
|
37
|
+
if new_fragment
|
38
|
+
return [true, fragments - [fragment1] - [fragment2] + [new_fragment]]
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
[false, fragments]
|
43
|
+
end
|
44
|
+
|
45
|
+
def mix_if_gain fragment1, fragment2, doc, output
|
46
|
+
# Won't get gain if the fragment does not produce the same kind of RDF resource
|
47
|
+
return if !fragment1.sc::type.equivalent?(fragment2.sc::type) or
|
48
|
+
!fragment1.sc::relation.equivalent?(fragment2.sc::relation) or
|
49
|
+
!fragment1.sc::superclass.equivalent?(fragment2.sc::superclass) or
|
50
|
+
!fragment1.sc::sameas.equivalent?(fragment2.sc::sameas) or
|
51
|
+
fragment1.sc::identifier.size != fragment2.sc::identifier.size
|
52
|
+
|
53
|
+
# Build new fragment
|
54
|
+
new_fragment = Node(nil)
|
55
|
+
new_fragment.rdf::type = fragment1.rdf::type
|
56
|
+
new_fragment.sc::type = fragment1.sc::type
|
57
|
+
new_fragment.sc::relation = fragment1.sc::relation
|
58
|
+
new_fragment.sc::superclass = fragment1.sc::superclass
|
59
|
+
new_fragment.sc::sameas = fragment1.sc::sameas
|
60
|
+
|
61
|
+
# sc:selector
|
62
|
+
selector = generalize_selectors(fragment1.sc::selector + fragment2.sc::selector)
|
63
|
+
new_fragment.graph << selector
|
64
|
+
new_fragment.sc::selector = selector
|
65
|
+
|
66
|
+
# sc:identifier
|
67
|
+
if fragment1.sc::identifier.first
|
68
|
+
selector = generalize_selectors(fragment1.sc::identifier + fragment2.sc::identifier)
|
69
|
+
new_fragment.graph << selector
|
70
|
+
new_fragment.sc::identifier = selector
|
71
|
+
end
|
72
|
+
|
73
|
+
# sc:subfragment
|
74
|
+
all_subfragments = fragment1.sc::subfragment + fragment2.sc::subfragment
|
75
|
+
all_subfragments.map { |sf| [sf.sc::type.sort_by(&:to_s), sf.sc::relation.sort_by(&:to_s)] }.uniq.each do |types, relations|
|
76
|
+
subfragments = all_subfragments.select do |sf|
|
77
|
+
sf.sc::type.sort_by(&:to_s) == types and
|
78
|
+
sf.sc::relation.sort_by(&:to_s) == relations
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
# Check new output
|
83
|
+
separate_output1 = fragment1.extract_graph :doc=>doc
|
84
|
+
separate_output2 = fragment2.extract_graph :doc=>doc
|
85
|
+
separate_output = separate_output1.merge separate_output2
|
86
|
+
new_output = new_fragment.proxy.extract_graph :doc=>doc
|
87
|
+
|
88
|
+
# Check if the output with the new fragment is a subset of the full output
|
89
|
+
# and if the output of the fragments alone is a subset of the output of the new
|
90
|
+
# fragment. This way we ensure the output is the same without using all the
|
91
|
+
# fragments that are available in the knowledge base.
|
92
|
+
new_fragment.proxy # if output.contains?(new_output) and new_output.contains?(separate_output)
|
93
|
+
end
|
94
|
+
|
95
|
+
def generalize_selectors selectors
|
96
|
+
selector = Node(nil)
|
97
|
+
selector.rdf::type = Node('sc:VisualSelector')
|
98
|
+
selector.sc::min_relative_x = selectors.map { |s| s.sc::min_relative_x.map(&:to_i) }.flatten.min.to_s
|
99
|
+
selector.sc::max_relative_x = selectors.map { |s| s.sc::max_relative_x.map(&:to_i) }.flatten.max.to_s
|
100
|
+
selector.sc::min_relative_y = selectors.map { |s| s.sc::min_relative_y.map(&:to_i) }.flatten.min.to_s
|
101
|
+
selector.sc::max_relative_y = selectors.map { |s| s.sc::max_relative_y.map(&:to_i) }.flatten.max.to_s
|
102
|
+
selector.sc::min_x = selectors.map { |s| s.sc::min_x.map(&:to_i) }.flatten.min.to_s
|
103
|
+
selector.sc::max_x = selectors.map { |s| s.sc::max_x.map(&:to_i) }.flatten.max.to_s
|
104
|
+
selector.sc::min_y = selectors.map { |s| s.sc::min_y.map(&:to_i) }.flatten.min.to_s
|
105
|
+
selector.sc::max_y = selectors.map { |s| s.sc::max_y.map(&:to_i) }.flatten.max.to_s
|
106
|
+
selector.sc::min_width = selectors.map { |s| s.sc::min_width.map(&:to_i) }.flatten.min.to_s
|
107
|
+
selector.sc::max_width = selectors.map { |s| s.sc::max_width.map(&:to_i) }.flatten.max.to_s
|
108
|
+
selector.sc::min_height = selectors.map { |s| s.sc::min_height.map(&:to_i) }.flatten.min.to_s
|
109
|
+
selector.sc::max_height = selectors.map { |s| s.sc::max_height.map(&:to_i) }.flatten.max.to_s
|
110
|
+
selector.sc::min_font_size = selectors.map { |s| s.sc::min_font_size.map(&:to_i) }.flatten.min.to_s if selectors.all? { |s| s.sc::min_font_size.first }
|
111
|
+
selector.sc::max_font_size = selectors.map { |s| s.sc::max_font_size.map(&:to_i) }.flatten.max.to_s if selectors.all? { |s| s.sc::max_font_size.first }
|
112
|
+
selector.sc::min_font_weight = selectors.map { |s| s.sc::min_font_weight.map(&:to_i) }.flatten.min.to_s if selectors.all? { |s| s.sc::min_font_weight.first }
|
113
|
+
selector.sc::max_font_weight = selectors.map { |s| s.sc::max_font_weight.map(&:to_i) }.flatten.max.to_s if selectors.all? { |s| s.sc::max_font_weight.first }
|
114
|
+
selector.sc::font_family = selectors.first.sc::font_family if selectors.map { |s| s.sc::font_family }.flatten.uniq.size == 1
|
115
|
+
selector.sc::tag = selectors.first.sc::tag if selectors.map { |s| s.sc::tag }.flatten.uniq.size == 1
|
116
|
+
selector.sc::attribute = selectors.first.sc::attribute if selectors.map { |s| s.sc::attribute }.flatten.uniq.size == 1
|
117
|
+
|
118
|
+
selector
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
@@ -6,10 +6,6 @@ module Scrappy
|
|
6
6
|
triples + train_sample(sample).triples
|
7
7
|
end )
|
8
8
|
end
|
9
|
-
|
10
|
-
# Optimizes the knowledge base by generalizing patterns
|
11
|
-
def optimize
|
12
|
-
end
|
13
9
|
|
14
10
|
private
|
15
11
|
def train_sample sample
|
@@ -34,37 +30,31 @@ module Scrappy
|
|
34
30
|
fragment.graph << selector
|
35
31
|
fragment.sc::selector = selector
|
36
32
|
when ID("sc:uri") then
|
37
|
-
# Assumption: URIs are extracted from a link
|
38
33
|
selector = selector_for(node.sc::uri.first.sc::source.first, node)
|
39
|
-
selector.sc::tag = "a"
|
40
|
-
selector.sc::attribute = "href"
|
41
|
-
|
42
34
|
fragment.graph << selector
|
43
35
|
fragment.sc::identifier = selector
|
44
36
|
when ID("rdf:type") then
|
45
37
|
fragment.sc::type = node.rdf::type
|
46
38
|
else
|
47
|
-
if node[predicate].map(&:class).uniq.first
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
graph = RDF::Graph.new( subfragments.inject([]) do |triples, subfragment|
|
52
|
-
triples + subfragment.graph.triples.map { |s,p,o| [s==subfragment.id ? id : s,p,o] }
|
53
|
-
end )
|
54
|
-
subfragment = graph[id]
|
55
|
-
subfragment.sc::relation = Node(predicate)
|
56
|
-
subfragment.sc::min_cardinality = "1"
|
39
|
+
if node[predicate].map(&:class).uniq.first == RDF::Node
|
40
|
+
node[predicate].map do |subnode|
|
41
|
+
subfragment = fragment_for(subnode, node)
|
42
|
+
subfragment.sc::relation = Node(predicate)
|
57
43
|
|
58
|
-
|
59
|
-
|
44
|
+
fragment.graph << subfragment
|
45
|
+
fragment.sc::subfragment += [subfragment]
|
46
|
+
end
|
60
47
|
end
|
61
48
|
end
|
62
49
|
end
|
63
|
-
fragment.rdf::type = Node("sc:Fragment")
|
50
|
+
fragment.rdf::type = Node("sc:Fragment")
|
51
|
+
fragment.sc::min_cardinality = "1"
|
52
|
+
fragment.sc::max_cardinality = "1"
|
64
53
|
fragment
|
65
54
|
end
|
66
55
|
|
67
56
|
def selector_for fragment, parent=nil
|
57
|
+
fragment_selector = fragment.sc::selector.first
|
68
58
|
presentation = fragment.sc::presentation.first
|
69
59
|
|
70
60
|
selector = Node(nil)
|
@@ -94,6 +84,9 @@ module Scrappy
|
|
94
84
|
selector.sc::min_font_weight = presentation.sc::font_weight
|
95
85
|
selector.sc::max_font_weight = presentation.sc::font_weight
|
96
86
|
selector.sc::font_family = presentation.sc::font_family
|
87
|
+
|
88
|
+
selector.sc::tag = fragment_selector.sc::tag.select { |tag| ["a","img"].include?(tag) }
|
89
|
+
selector.sc::attribute = fragment_selector.sc::attribute
|
97
90
|
|
98
91
|
selector
|
99
92
|
end
|
data/lib/scrappy/server/admin.rb
CHANGED
@@ -62,7 +62,7 @@ module Scrappy
|
|
62
62
|
map { |node| node.sc::type }.flatten.map(&:to_s).sort
|
63
63
|
haml :patterns
|
64
64
|
end
|
65
|
-
|
65
|
+
|
66
66
|
app.delete '/patterns/*' do |uri|
|
67
67
|
Scrappy::App.delete_pattern uri
|
68
68
|
flash[:notice] = "Pattern deleted"
|
@@ -94,6 +94,12 @@ module Scrappy
|
|
94
94
|
redirect "#{settings.base_uri}/samples"
|
95
95
|
end
|
96
96
|
|
97
|
+
app.post '/samples/:id/optimize' do |id|
|
98
|
+
Scrappy::App.save_patterns agent.optimize(Scrappy::Kb.patterns, Scrappy::App.samples[id.to_i])
|
99
|
+
flash[:notice] = "Optimization completed"
|
100
|
+
redirect "#{settings.base_uri}/samples"
|
101
|
+
end
|
102
|
+
|
97
103
|
app.post '/samples' do
|
98
104
|
html = Iconv.iconv('UTF-8', params[:encoding], params[:html]).first
|
99
105
|
sample = Scrappy::App.add_sample(:html=>html, :uri=>params[:uri], :date=>Time.now)
|
data/lib/scrappy/support.rb
CHANGED
@@ -29,4 +29,28 @@ class String
|
|
29
29
|
tr("-", "_").
|
30
30
|
downcase
|
31
31
|
end
|
32
|
+
end
|
33
|
+
|
34
|
+
class Array
|
35
|
+
# Return true if a given array has the same elements as this one
|
36
|
+
def equivalent? array
|
37
|
+
self.all? { |i| array.include?(i) } and
|
38
|
+
array.all? { |i| self.include?(i) }
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
module RDF
|
43
|
+
class Node
|
44
|
+
def self.mix *nodes
|
45
|
+
id = nodes.first
|
46
|
+
graph = RDF::Graph.new( nodes.inject([]) do |triples, node|
|
47
|
+
triples + node.graph.triples.map do |s,p,o|
|
48
|
+
[ s==node.id ? id : s,
|
49
|
+
p==node.id ? id : p,
|
50
|
+
o==node.id ? id : o ]
|
51
|
+
end
|
52
|
+
end )
|
53
|
+
graph[id]
|
54
|
+
end
|
55
|
+
end
|
32
56
|
end
|
data/lib/scrappy.rb
CHANGED
@@ -15,7 +15,8 @@ require 'scrappy/support'
|
|
15
15
|
require 'scrappy/repository'
|
16
16
|
|
17
17
|
require 'scrappy/extractor/extractor'
|
18
|
-
require 'scrappy/
|
18
|
+
require 'scrappy/learning/trainer'
|
19
|
+
require 'scrappy/learning/optimizer'
|
19
20
|
require 'scrappy/agent/map_reduce'
|
20
21
|
require 'scrappy/agent/cache'
|
21
22
|
require 'scrappy/agent/dumper'
|
@@ -23,5 +24,5 @@ require 'scrappy/agent/blind_agent'
|
|
23
24
|
require 'scrappy/agent/agent'
|
24
25
|
|
25
26
|
module Scrappy
|
26
|
-
VERSION = '0.3.
|
27
|
+
VERSION = '0.3.3'
|
27
28
|
end
|
@@ -16,16 +16,28 @@ var add_visual_data = function() {
|
|
16
16
|
item.setAttribute('vy', y);
|
17
17
|
item.setAttribute('vw', item.offsetWidth);
|
18
18
|
item.setAttribute('vh', item.offsetHeight);
|
19
|
-
|
20
|
-
|
21
|
-
item.
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
19
|
+
|
20
|
+
var item_with_text = false;
|
21
|
+
for (var k=0; k<item.childNodes.length; k++) {
|
22
|
+
child = item.childNodes[k]
|
23
|
+
if (child.nodeName == "#text" && child.textContent.trim() != "") {
|
24
|
+
item_with_text = true;
|
25
|
+
break;
|
26
|
+
}
|
27
|
+
}
|
28
|
+
|
29
|
+
if (item_with_text) {
|
30
|
+
var size = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-size');
|
31
|
+
size = size.substring(0, size.length-2);
|
32
|
+
item.setAttribute('vsize', size);
|
33
|
+
var fonts = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-family').split(",");
|
34
|
+
var font = fonts[fonts.length-1].trim();
|
35
|
+
item.setAttribute('vfont', font);
|
36
|
+
var weight = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-weight');
|
37
|
+
if (weight == 'normal') weight = 400;
|
38
|
+
if (weight == 'bold') weight = 700;
|
39
|
+
item.setAttribute('vweight', weight);
|
40
|
+
}
|
29
41
|
}
|
30
42
|
}
|
31
43
|
|
data/scrappy.gemspec
CHANGED
@@ -2,30 +2,30 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.3.
|
5
|
+
s.version = "0.3.3"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-03-
|
9
|
+
s.date = %q{2011-03-25}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
13
13
|
s.executables = ["scrappy"]
|
14
|
-
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/extractor/extractor.rb", "lib/scrappy/extractor/formats.rb", "lib/scrappy/extractor/fragment.rb", "lib/scrappy/extractor/selector.rb", "lib/scrappy/extractor/selectors/base_uri.rb", "lib/scrappy/extractor/selectors/css.rb", "lib/scrappy/extractor/selectors/new_uri.rb", "lib/scrappy/extractor/selectors/root.rb", "lib/scrappy/extractor/selectors/section.rb", "lib/scrappy/extractor/selectors/slice.rb", "lib/scrappy/extractor/selectors/uri.rb", "lib/scrappy/extractor/selectors/uri_pattern.rb", "lib/scrappy/extractor/selectors/visual.rb", "lib/scrappy/extractor/selectors/xpath.rb", "lib/scrappy/
|
15
|
-
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "
|
14
|
+
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "extractors/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/extractor/extractor.rb", "lib/scrappy/extractor/formats.rb", "lib/scrappy/extractor/fragment.rb", "lib/scrappy/extractor/selector.rb", "lib/scrappy/extractor/selectors/base_uri.rb", "lib/scrappy/extractor/selectors/css.rb", "lib/scrappy/extractor/selectors/new_uri.rb", "lib/scrappy/extractor/selectors/root.rb", "lib/scrappy/extractor/selectors/section.rb", "lib/scrappy/extractor/selectors/slice.rb", "lib/scrappy/extractor/selectors/uri.rb", "lib/scrappy/extractor/selectors/uri_pattern.rb", "lib/scrappy/extractor/selectors/visual.rb", "lib/scrappy/extractor/selectors/xpath.rb", "lib/scrappy/learning/optimizer.rb", "lib/scrappy/learning/trainer.rb", "lib/scrappy/repository.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb"]
|
15
|
+
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "extractors/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/extractor/extractor.rb", "lib/scrappy/extractor/formats.rb", "lib/scrappy/extractor/fragment.rb", "lib/scrappy/extractor/selector.rb", "lib/scrappy/extractor/selectors/base_uri.rb", "lib/scrappy/extractor/selectors/css.rb", "lib/scrappy/extractor/selectors/new_uri.rb", "lib/scrappy/extractor/selectors/root.rb", "lib/scrappy/extractor/selectors/section.rb", "lib/scrappy/extractor/selectors/slice.rb", "lib/scrappy/extractor/selectors/uri.rb", "lib/scrappy/extractor/selectors/uri_pattern.rb", "lib/scrappy/extractor/selectors/visual.rb", "lib/scrappy/extractor/selectors/xpath.rb", "lib/scrappy/learning/optimizer.rb", "lib/scrappy/learning/trainer.rb", "lib/scrappy/repository.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb", "public/favicon.ico", "public/images/logo.png", "public/images/logo_tiny.png", "public/javascripts/annotator.js", "public/javascripts/remote.js", "public/stylesheets/application.css", "test/test_helper.rb", "test/test_scrappy.rb", "views/extractors.haml", "views/help.haml", "views/home.haml", "views/layout.haml", "views/patterns.haml", "views/samples.haml", "scrappy.gemspec"]
|
16
16
|
s.homepage = %q{http://github.com/josei/scrappy}
|
17
17
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
|
18
18
|
s.require_paths = ["lib"]
|
19
19
|
s.rubyforge_project = %q{scrappy}
|
20
|
-
s.rubygems_version = %q{1.3.
|
20
|
+
s.rubygems_version = %q{1.3.6}
|
21
21
|
s.summary = %q{Web scraper that allows producing RDF data out of plain web pages}
|
22
|
-
s.test_files = ["test/
|
22
|
+
s.test_files = ["test/test_scrappy.rb", "test/test_helper.rb"]
|
23
23
|
|
24
24
|
if s.respond_to? :specification_version then
|
25
25
|
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
26
26
|
s.specification_version = 3
|
27
27
|
|
28
|
-
if Gem::Version.new(Gem::
|
28
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
29
29
|
s.add_runtime_dependency(%q<activesupport>, [">= 2.3.5"])
|
30
30
|
s.add_runtime_dependency(%q<sinatra>, [">= 1.1.2"])
|
31
31
|
s.add_runtime_dependency(%q<thin>, [">= 1.2.7"])
|
data/views/samples.haml
CHANGED
@@ -21,6 +21,8 @@
|
|
21
21
|
-[['Patterns output', :patterns], ['Extractors output', :extractors]].reverse.each do |text, action|
|
22
22
|
%span.format
|
23
23
|
%a{:href=>"#{settings.base_uri}/samples/#{i}/#{action}"}=text
|
24
|
+
%span.format
|
25
|
+
%a{:href=>"#{settings.base_uri}/samples/#{i}/optimize", :'data-method'=>:post} Optimize
|
24
26
|
%span.format
|
25
27
|
%a{:href=>"#{settings.base_uri}/samples/#{i}/train", :'data-method'=>:post} Train
|
26
28
|
%span.date
|
metadata
CHANGED
@@ -1,13 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrappy
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash: 23
|
5
4
|
prerelease: false
|
6
5
|
segments:
|
7
6
|
- 0
|
8
7
|
- 3
|
9
|
-
-
|
10
|
-
version: 0.3.
|
8
|
+
- 3
|
9
|
+
version: 0.3.3
|
11
10
|
platform: ruby
|
12
11
|
authors:
|
13
12
|
- Jose Ignacio
|
@@ -15,18 +14,16 @@ autorequire:
|
|
15
14
|
bindir: bin
|
16
15
|
cert_chain: []
|
17
16
|
|
18
|
-
date: 2011-03-
|
17
|
+
date: 2011-03-25 00:00:00 +01:00
|
19
18
|
default_executable:
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
22
21
|
name: activesupport
|
23
22
|
prerelease: false
|
24
23
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
24
|
requirements:
|
27
25
|
- - ">="
|
28
26
|
- !ruby/object:Gem::Version
|
29
|
-
hash: 9
|
30
27
|
segments:
|
31
28
|
- 2
|
32
29
|
- 3
|
@@ -38,11 +35,9 @@ dependencies:
|
|
38
35
|
name: sinatra
|
39
36
|
prerelease: false
|
40
37
|
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
38
|
requirements:
|
43
39
|
- - ">="
|
44
40
|
- !ruby/object:Gem::Version
|
45
|
-
hash: 23
|
46
41
|
segments:
|
47
42
|
- 1
|
48
43
|
- 1
|
@@ -54,11 +49,9 @@ dependencies:
|
|
54
49
|
name: thin
|
55
50
|
prerelease: false
|
56
51
|
requirement: &id003 !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
52
|
requirements:
|
59
53
|
- - ">="
|
60
54
|
- !ruby/object:Gem::Version
|
61
|
-
hash: 17
|
62
55
|
segments:
|
63
56
|
- 1
|
64
57
|
- 2
|
@@ -70,11 +63,9 @@ dependencies:
|
|
70
63
|
name: nokogiri
|
71
64
|
prerelease: false
|
72
65
|
requirement: &id004 !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
66
|
requirements:
|
75
67
|
- - ">="
|
76
68
|
- !ruby/object:Gem::Version
|
77
|
-
hash: 5
|
78
69
|
segments:
|
79
70
|
- 1
|
80
71
|
- 4
|
@@ -86,11 +77,9 @@ dependencies:
|
|
86
77
|
name: mechanize
|
87
78
|
prerelease: false
|
88
79
|
requirement: &id005 !ruby/object:Gem::Requirement
|
89
|
-
none: false
|
90
80
|
requirements:
|
91
81
|
- - ">="
|
92
82
|
- !ruby/object:Gem::Version
|
93
|
-
hash: 23
|
94
83
|
segments:
|
95
84
|
- 1
|
96
85
|
- 0
|
@@ -102,11 +91,9 @@ dependencies:
|
|
102
91
|
name: lightrdf
|
103
92
|
prerelease: false
|
104
93
|
requirement: &id006 !ruby/object:Gem::Requirement
|
105
|
-
none: false
|
106
94
|
requirements:
|
107
95
|
- - ">="
|
108
96
|
- !ruby/object:Gem::Version
|
109
|
-
hash: 19
|
110
97
|
segments:
|
111
98
|
- 0
|
112
99
|
- 3
|
@@ -118,11 +105,9 @@ dependencies:
|
|
118
105
|
name: i18n
|
119
106
|
prerelease: false
|
120
107
|
requirement: &id007 !ruby/object:Gem::Requirement
|
121
|
-
none: false
|
122
108
|
requirements:
|
123
109
|
- - ">="
|
124
110
|
- !ruby/object:Gem::Version
|
125
|
-
hash: 11
|
126
111
|
segments:
|
127
112
|
- 0
|
128
113
|
- 4
|
@@ -134,11 +119,9 @@ dependencies:
|
|
134
119
|
name: rest-client
|
135
120
|
prerelease: false
|
136
121
|
requirement: &id008 !ruby/object:Gem::Requirement
|
137
|
-
none: false
|
138
122
|
requirements:
|
139
123
|
- - ">="
|
140
124
|
- !ruby/object:Gem::Version
|
141
|
-
hash: 13
|
142
125
|
segments:
|
143
126
|
- 1
|
144
127
|
- 6
|
@@ -150,11 +133,9 @@ dependencies:
|
|
150
133
|
name: haml
|
151
134
|
prerelease: false
|
152
135
|
requirement: &id009 !ruby/object:Gem::Requirement
|
153
|
-
none: false
|
154
136
|
requirements:
|
155
137
|
- - ">="
|
156
138
|
- !ruby/object:Gem::Version
|
157
|
-
hash: 55
|
158
139
|
segments:
|
159
140
|
- 3
|
160
141
|
- 0
|
@@ -166,11 +147,9 @@ dependencies:
|
|
166
147
|
name: rack-flash
|
167
148
|
prerelease: false
|
168
149
|
requirement: &id010 !ruby/object:Gem::Requirement
|
169
|
-
none: false
|
170
150
|
requirements:
|
171
151
|
- - ">="
|
172
152
|
- !ruby/object:Gem::Version
|
173
|
-
hash: 25
|
174
153
|
segments:
|
175
154
|
- 0
|
176
155
|
- 1
|
@@ -187,6 +166,7 @@ extensions: []
|
|
187
166
|
extra_rdoc_files:
|
188
167
|
- README.rdoc
|
189
168
|
- bin/scrappy
|
169
|
+
- extractors/elmundo.yarf
|
190
170
|
- lib/scrappy.rb
|
191
171
|
- lib/scrappy/agent/agent.rb
|
192
172
|
- lib/scrappy/agent/blind_agent.rb
|
@@ -207,20 +187,21 @@ extra_rdoc_files:
|
|
207
187
|
- lib/scrappy/extractor/selectors/uri_pattern.rb
|
208
188
|
- lib/scrappy/extractor/selectors/visual.rb
|
209
189
|
- lib/scrappy/extractor/selectors/xpath.rb
|
190
|
+
- lib/scrappy/learning/optimizer.rb
|
191
|
+
- lib/scrappy/learning/trainer.rb
|
210
192
|
- lib/scrappy/repository.rb
|
211
193
|
- lib/scrappy/server/admin.rb
|
212
194
|
- lib/scrappy/server/errors.rb
|
213
195
|
- lib/scrappy/server/helpers.rb
|
214
196
|
- lib/scrappy/server/server.rb
|
215
197
|
- lib/scrappy/support.rb
|
216
|
-
- lib/scrappy/trainer/trainer.rb
|
217
198
|
files:
|
218
199
|
- History.txt
|
219
200
|
- Manifest
|
220
201
|
- README.rdoc
|
221
202
|
- Rakefile
|
222
203
|
- bin/scrappy
|
223
|
-
-
|
204
|
+
- extractors/elmundo.yarf
|
224
205
|
- lib/scrappy.rb
|
225
206
|
- lib/scrappy/agent/agent.rb
|
226
207
|
- lib/scrappy/agent/blind_agent.rb
|
@@ -241,13 +222,14 @@ files:
|
|
241
222
|
- lib/scrappy/extractor/selectors/uri_pattern.rb
|
242
223
|
- lib/scrappy/extractor/selectors/visual.rb
|
243
224
|
- lib/scrappy/extractor/selectors/xpath.rb
|
225
|
+
- lib/scrappy/learning/optimizer.rb
|
226
|
+
- lib/scrappy/learning/trainer.rb
|
244
227
|
- lib/scrappy/repository.rb
|
245
228
|
- lib/scrappy/server/admin.rb
|
246
229
|
- lib/scrappy/server/errors.rb
|
247
230
|
- lib/scrappy/server/helpers.rb
|
248
231
|
- lib/scrappy/server/server.rb
|
249
232
|
- lib/scrappy/support.rb
|
250
|
-
- lib/scrappy/trainer/trainer.rb
|
251
233
|
- public/favicon.ico
|
252
234
|
- public/images/logo.png
|
253
235
|
- public/images/logo_tiny.png
|
@@ -278,20 +260,16 @@ rdoc_options:
|
|
278
260
|
require_paths:
|
279
261
|
- lib
|
280
262
|
required_ruby_version: !ruby/object:Gem::Requirement
|
281
|
-
none: false
|
282
263
|
requirements:
|
283
264
|
- - ">="
|
284
265
|
- !ruby/object:Gem::Version
|
285
|
-
hash: 3
|
286
266
|
segments:
|
287
267
|
- 0
|
288
268
|
version: "0"
|
289
269
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
290
|
-
none: false
|
291
270
|
requirements:
|
292
271
|
- - ">="
|
293
272
|
- !ruby/object:Gem::Version
|
294
|
-
hash: 11
|
295
273
|
segments:
|
296
274
|
- 1
|
297
275
|
- 2
|
@@ -299,10 +277,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
299
277
|
requirements: []
|
300
278
|
|
301
279
|
rubyforge_project: scrappy
|
302
|
-
rubygems_version: 1.3.
|
280
|
+
rubygems_version: 1.3.6
|
303
281
|
signing_key:
|
304
282
|
specification_version: 3
|
305
283
|
summary: Web scraper that allows producing RDF data out of plain web pages
|
306
284
|
test_files:
|
307
|
-
- test/test_helper.rb
|
308
285
|
- test/test_scrappy.rb
|
286
|
+
- test/test_helper.rb
|