scrappy 0.3.2 → 0.3.3

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,10 @@
1
+ === 0.3.3 2011-03-25
2
+
3
+ * Fix in NewUriSelector
4
+ * Improved extraction process
5
+ * Removed -R option
6
+ * Removed irrelevant references to base URIs
7
+
1
8
  === 0.3.2 2011-03-18
2
9
 
3
10
  * Correction of issue with certain Ruby versions
data/Manifest CHANGED
@@ -3,7 +3,7 @@ Manifest
3
3
  README.rdoc
4
4
  Rakefile
5
5
  bin/scrappy
6
- kb/elmundo.yarf
6
+ extractors/elmundo.yarf
7
7
  lib/scrappy.rb
8
8
  lib/scrappy/agent/agent.rb
9
9
  lib/scrappy/agent/blind_agent.rb
@@ -24,13 +24,14 @@ lib/scrappy/extractor/selectors/uri.rb
24
24
  lib/scrappy/extractor/selectors/uri_pattern.rb
25
25
  lib/scrappy/extractor/selectors/visual.rb
26
26
  lib/scrappy/extractor/selectors/xpath.rb
27
+ lib/scrappy/learning/optimizer.rb
28
+ lib/scrappy/learning/trainer.rb
27
29
  lib/scrappy/repository.rb
28
30
  lib/scrappy/server/admin.rb
29
31
  lib/scrappy/server/errors.rb
30
32
  lib/scrappy/server/helpers.rb
31
33
  lib/scrappy/server/server.rb
32
34
  lib/scrappy/support.rb
33
- lib/scrappy/trainer/trainer.rb
34
35
  public/favicon.ico
35
36
  public/images/logo.png
36
37
  public/images/logo_tiny.png
data/bin/scrappy CHANGED
@@ -40,8 +40,7 @@ module Scrappy
40
40
  opts.on('-c C', '--concurrence C') { |c| Agent::Options.workers = c.to_i }
41
41
  opts.on('-d D', '--delay D') { |d| Agent::Options.delay = d; Agent::Options.workers = 1 }
42
42
  opts.on('-l L', '--levels L') { |l| Agent::Options.depth = l.to_i }
43
- opts.on('-r', '--reference') { Agent::Options.referenceable = :minimum }
44
- opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
43
+ opts.on('-r', '--reference') { Agent::Options.referenceable = true }
45
44
  opts.on('-f FORMAT', '--format FORMAT') { |f| Agent::Options.format = f.to_sym }
46
45
  opts.on('-t TIME', '--time TIME') { |t| Agent::Options.time = t.to_i*60 } # converts minutes to seconds
47
46
  opts.on('-o URIs', '--observe URIs') { |uris| Options.observe = uris.split(',') }
@@ -106,9 +105,12 @@ module Scrappy
106
105
  end
107
106
  def self.add_pattern graph
108
107
  new_patterns = Scrappy::Kb.patterns.merge graph
109
- open(@patterns_file, "w") { |f| f.write new_patterns.serialize(:yarf) }
108
+ save_patterns new_patterns
110
109
  onload
111
110
  end
111
+ def self.save_patterns new_patterns
112
+ open(@patterns_file, "w") { |f| f.write new_patterns.serialize(:yarf) }
113
+ end
112
114
  def self.delete_pattern uri
113
115
  graph = Scrappy::Kb.patterns
114
116
  fragments = graph.find(nil, Node('rdf:type'), Node('sc:Fragment')).
@@ -169,8 +171,7 @@ Options
169
171
  -a, --admin [ROOT] Runs admin web server (optionally specify server's root url)
170
172
  -P, --port PORT Selects port number (default is 3434)
171
173
  -t, --time TIME Returns repository data from the last given minutes
172
- -r, --reference Outputs referenceable data
173
- -R, --reference-all Outputs all HTML referenceable data
174
+ -r, --reference Outputs reference information
174
175
 
175
176
  Authors
176
177
  José Ignacio Fernández, Alberto Mardomingo, Jacobo Blasco
File without changes
@@ -3,6 +3,7 @@ module Scrappy
3
3
  include MonitorMixin
4
4
  include Extractor
5
5
  include Trainer
6
+ include Optimizer
6
7
  include MapReduce
7
8
  include Cached
8
9
  include BlindAgent
@@ -28,9 +28,6 @@ module Scrappy
28
28
  end
29
29
  end
30
30
 
31
- # Add references to sources if requested
32
- triples += add_referenceable_data uri, content, triples, referenceable if referenceable
33
-
34
31
  puts "done!" if self.options.debug
35
32
 
36
33
  triples
@@ -38,71 +35,24 @@ module Scrappy
38
35
  end
39
36
 
40
37
  def fragments_for kb, uri
41
- uri_selectors = ( kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')) +
42
- kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector')) ).
43
- flatten.select do |uri_selector|
44
- !kb.node(uri_selector).filter(:uri=>uri).empty?
38
+ root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
39
+
40
+ selectors = []
41
+ fragments = {}
42
+ root_fragments.each do |fragment|
43
+ fragment.sc::selector.each do |selector|
44
+ fragments[selector] = fragment
45
+ selectors << selector
46
+ end
45
47
  end
46
-
47
- visual_selectors = kb.find(nil, Node('rdf:type'), Node('sc:VisualSelector'))
48
-
49
- selectors = uri_selectors + visual_selectors
50
-
51
- selectors.map { |selector| kb.find(nil, Node('sc:selector'), selector) }.
52
- flatten.
53
- select { |selector| selector.rdf::type.include?(Node('sc:Fragment')) }
54
- end
55
-
56
- private
57
- def add_referenceable_data uri, content, given_triples, referenceable
58
- triples = []
59
- resources = {}; given_triples.each { |s,p,o| resources[s] = resources[o] = true }
60
48
 
61
- fragment = Node(Extractor.node_hash(uri, '/'))
62
- selector = Node(nil)
63
- presentation = Node(nil)
64
-
65
- selector.rdf::type = Node('sc:UnivocalSelector')
66
- selector.sc::path = '/'
67
- selector.sc::document = uri
49
+ uri_selectors = selectors.select { |selector| selector.rdf::type.include?(Node('sc:UriSelector')) or
50
+ selector.rdf::type.include?(Node('sc:UriPatternSelector')) }.
51
+ select { |selector| !kb.node(selector).filter(:uri=>uri).empty? }
68
52
 
69
- fragment.sc::selector = selector
70
-
71
- triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples) if referenceable==:dump or resources[fragment.id]
72
-
73
- content.search('*').each do |node|
74
- next if node.text?
75
-
76
- fragment = Extractor.node_hash(uri, node.path)
77
-
78
- if referenceable == :dump or resources[fragment]
79
- selector = ID(nil)
80
- presentation = ID(nil)
81
-
82
- triples << [selector, ID('rdf:type'), ID('sc:UnivocalSelector')]
83
- triples << [selector, ID('sc:path'), node.path.to_s]
84
- triples << [selector, ID('sc:tag'), node.name.to_s]
85
- triples << [selector, ID('sc:document'), uri]
86
-
87
- triples << [presentation, ID('sc:x'), node[:vx].to_s] if node[:vx]
88
- triples << [presentation, ID('sc:y'), node[:vy].to_s] if node[:vy]
89
- triples << [presentation, ID('sc:width'), node[:vw].to_s] if node[:vw]
90
- triples << [presentation, ID('sc:height'), node[:vh].to_s] if node[:vh]
91
- triples << [presentation, ID('sc:font_size'), node[:vsize].gsub("px","").to_s] if node[:vsize]
92
- triples << [presentation, ID('sc:font_family'), node[:vfont]] if node[:vfont]
93
- triples << [presentation, ID('sc:font_weight'), node[:vweight].to_s] if node[:vweight]
94
- triples << [presentation, ID('sc:text'), node.text.strip]
95
-
96
- triples << [fragment, ID('sc:selector'), selector]
97
- triples << [fragment, ID('sc:presentation'), presentation]
98
- end
99
- end
100
- triples
101
- end
53
+ visual_selectors = selectors.select { |selector| selector.rdf::type.include?(Node('sc:VisualSelector')) }
102
54
 
103
- def self.node_hash uri, path
104
- digest = Digest::MD5.hexdigest("#{uri} #{path}")
105
- :"_:bnode#{digest}"
55
+ (uri_selectors + visual_selectors).map { |selector| fragments[selector] }
106
56
  end
107
57
  end
108
58
  end
@@ -2,101 +2,126 @@ module Sc
2
2
  class Fragment
3
3
  include RDF::NodeProxy
4
4
 
5
+ # Extracts data out of a document and returns an RDF::Graph
6
+ def extract_graph options={}
7
+ graph = RDF::Graph.new
8
+ extract(options).each { |node| graph << node }
9
+ graph
10
+ end
11
+
12
+ # Extracts data out of a document and returns an array of nodes
5
13
  def extract options={}
6
- uri = options[:doc][:uri]
14
+ # Extracts all the mappings and any subfragment
15
+ mappings(options).map do |result|
16
+ node = result[:node]
17
+ subfragments = result[:subfragments]
18
+ doc = result[:doc]
19
+
20
+ # Process subfragments
21
+ consistent = true
22
+ subfragments.each do |subfragment|
23
+ # Get subfragment object
24
+ subfragment = subfragment.proxy Node('sc:Fragment')
25
+
26
+ # Extract data from the subfragment
27
+ subnodes = subfragment.extract(options.merge(:doc=>doc))
28
+
29
+ # Add relations
30
+ subnodes.each do |subnode|
31
+ node.graph << subnode if subnode.is_a?(RDF::Node)
32
+ subfragment.sc::relation.each { |relation| node[relation] += [subnode] }
33
+ end
34
+
35
+ # Check consistency
36
+ consistent = false if subfragment.sc::min_cardinality.first and subnodes.size < subfragment.sc::min_cardinality.first.to_i
37
+ consistent = false if subfragment.sc::max_cardinality.first and subnodes.size > subfragment.sc::max_cardinality.first.to_i
38
+ end
39
+
40
+ # Skip the node if it has inconsistent relations
41
+ # For example: extracting a sioc:Post with no dc:title would
42
+ # violate the constraint sc:min_cardinality = 1
43
+ next if !consistent
44
+
45
+ node
46
+ end.compact
47
+ end
7
48
 
49
+ # Returns all the mappings between this fragment and RDF nodes
50
+ def mappings options
8
51
  # Identify the fragment's mappings
9
52
  docs = sc::selector.map { |s| graph.node(s).select options[:doc] }.flatten
10
53
 
11
- # Generate nodes for each page mapping
54
+ # Generate a result for each page mapping
12
55
  docs.map do |doc|
13
56
  # Build RDF nodes from identifier selectors (if present)
14
- nodes = self.nodes(uri, doc, options[:referenceable])
57
+ node = build_node(doc, options[:referenceable])
15
58
 
16
- # Add info to each node
17
- nodes.map do |node|
18
- # Build the object -- it can be a node or a literal
19
- object = if sc::type.include?(Node('rdf:Literal'))
20
- value = doc[:value].to_s.strip
21
- if options[:referenceable]
22
- node.rdf::value = value
23
- node.rdf::type = Node('rdf:Literal')
24
- node
25
- else
26
- value
27
- end
28
- else
29
- # Add statements about the node
30
- sc::type.each { |type| node.rdf::type += [type] if type != Node('rdf:Resource') }
31
- sc::superclass.each { |superclass| node.rdfs::subClassOf += [superclass] }
32
- sc::sameas.each { |samenode| node.owl::sameAs += [samenode] }
59
+ # Skip the node if no URI or bnode is created
60
+ next if !node
61
+
62
+ # Add info to the node
33
63
 
64
+ # Build the object -- it can be a node or a literal
65
+ object = if sc::type.include?(Node('rdf:Literal'))
66
+ value = doc[:value].to_s.strip
67
+ if options[:referenceable]
68
+ node.rdf::value = value
69
+ node.rdf::type = Node('rdf:Literal')
34
70
  node
71
+ else
72
+ value
35
73
  end
74
+ else
75
+ # Add statements about the node
76
+ sc::type.each { |type| node.rdf::type += [type] if type != Node('rdf:Resource') }
77
+ sc::superclass.each { |superclass| node.rdfs::subClassOf += [superclass] }
78
+ sc::sameas.each { |samenode| node.owl::sameAs += [samenode] }
36
79
 
37
- # Process subfragments
38
- consistent = true
39
- sc::subfragment.each do |subfragment|
40
- # Get subfragment object
41
- subfragment = graph.node(subfragment, Node('sc:Fragment'))
42
- # Extract data from the subfragment
43
- subnodes = subfragment.extract(options.merge(:doc=>doc))
44
-
45
- # Add relations
46
- subnodes.each do |subnode|
47
- node.graph << subnode if subnode.is_a?(RDF::Node)
48
- subfragment.sc::relation.each { |relation| node[relation] += [subnode] }
49
- end
50
-
51
- # Check consistency
52
- consistent = false if subfragment.sc::min_cardinality.first and subnodes.size < subfragment.sc::min_cardinality.first.to_i
53
- consistent = false if subfragment.sc::max_cardinality.first and subnodes.size > subfragment.sc::max_cardinality.first.to_i
54
- end
80
+ node
81
+ end
55
82
 
56
- # Skip the node if it has inconsistent relations
57
- # For example: extracting a sioc:Post with no dc:title would
58
- # violate the constraint sc:min_cardinality = 1
59
- next if !consistent
60
-
61
- # Add referenceable data if requested
62
- if options[:referenceable]
63
- sources = [doc[:content]].flatten.map { |n| Node(Scrappy::Extractor.node_hash(doc[:uri], n.path)) }
64
- sources.each do |source|
65
- sc::type.each { |type| source.sc::type += [type] }
66
- sc::relation.each { |relation| source.sc::relation += [relation] }
67
- node.graph << source
68
- node.sc::source += [source]
69
- end
70
- end
71
-
72
- # Object points to either the node or the literal
73
- object
83
+ # Add referenceable data if requested
84
+ if options[:referenceable] and node.size > 0
85
+ source = reference(doc)
86
+ source.sc::type = sc::type
87
+ source.sc::superclass = sc::superclass
88
+ source.sc::sameas = sc::sameas
89
+ source.sc::relation = sc::relation
90
+ node.graph << source
91
+ node.sc::source = source
74
92
  end
75
- end.flatten.compact
93
+
94
+ # Variable object points to either a node or a literal
95
+ # Return the object, as well as its subfragments (if any)
96
+ # and the doc it was extracted from
97
+ { :node=>object, :subfragments=>sc::subfragment, :doc=>doc }
98
+ end.compact
76
99
  end
77
100
 
78
- def nodes uri, doc, referenceable
79
- nodes = sc::identifier.map { |s| graph.node(s).select doc }.flatten.map do |d|
80
- node = Node(parse_uri(uri, d[:value]))
101
+ private
102
+ # Builds a node given a document
103
+ def build_node doc, referenceable
104
+ return Node(nil) if sc::identifier.empty?
105
+
106
+ sc::identifier.map { |s| graph.node(s).select doc }.flatten.map do |d|
107
+ node = Node(parse_uri(d[:uri], d[:value]))
81
108
 
82
109
  if referenceable
83
110
  # Include the fragment where the URI was built from
84
- uri_node = Node(nil, node.graph)
85
- hash = Scrappy::Extractor.node_hash(d[:uri], d[:content].path)
86
-
87
- node.sc::uri = uri_node
111
+ uri_node = Node(nil)
112
+ source = reference(d)
113
+ uri_node.graph << source
88
114
  uri_node.rdf::value = node.to_s
89
- uri_node.sc::source = Node(hash)
115
+ uri_node.sc::source = source
116
+
117
+ node.graph << uri_node
118
+ node.sc::uri = uri_node
90
119
  end
91
120
 
92
121
  node
93
- end
94
- nodes << Node(nil) if nodes.empty?
95
-
96
- nodes
122
+ end.first
97
123
  end
98
124
 
99
- private
100
125
  # Parses a URI by resolving relative paths
101
126
  def parse_uri(uri, rel_uri)
102
127
  return ID('*') if rel_uri.nil?
@@ -107,5 +132,41 @@ module Sc
107
132
  end
108
133
  end
109
134
 
135
+ # Builds an RDF reference to an HTML node
136
+ def reference doc
137
+ node = doc[:content].is_a?(Nokogiri::XML::NodeSet) ? doc[:content].first.parent : doc[:content]
138
+ attribute = doc[:attribute]
139
+ uri = doc[:uri]
140
+
141
+ source = Node(nil)
142
+ selector = Node(nil)
143
+ presentation = Node(nil)
144
+
145
+ source.graph << selector
146
+ source.sc::selector = selector
147
+
148
+ selector.rdf::type = Node('sc:UnivocalSelector')
149
+ selector.sc::path = node.path
150
+ selector.sc::document = uri
151
+ selector.sc::attribute = attribute if attribute
152
+
153
+ if node.path != '/'
154
+ selector.sc::tag = node.name
155
+ source.graph << presentation
156
+ source.sc::presentation = presentation
157
+ end
158
+
159
+ presentation.sc::x = node[:vx] if node[:vx]
160
+ presentation.sc::y = node[:vy] if node[:vy]
161
+ presentation.sc::width = node[:vw] if node[:vw]
162
+ presentation.sc::height = node[:vh] if node[:vh]
163
+ presentation.sc::font_size = node[:vsize] if node[:vsize]
164
+ presentation.sc::font_family = node[:vfont] if node[:vfont]
165
+ presentation.sc::font_weight = node[:vweight] if node[:vweight]
166
+ presentation.sc::text = node.text.strip
167
+
168
+ source
169
+ end
170
+
110
171
  end
111
172
  end
@@ -3,30 +3,33 @@ module Sc
3
3
  def filter doc
4
4
  contents = if sc::attribute.first
5
5
  # Select node's attribute if given
6
- sc::attribute.map { |attribute| doc[:content][attribute] }
6
+ sc::attribute.map { |attribute| [doc[:content][attribute], attribute] }
7
7
  else
8
- [ doc[:value] ]
8
+ [ [doc[:value], nil] ]
9
9
  end
10
10
 
11
11
  @indexes ||= Hash.new(0)
12
12
  prefix = sc::prefix.first.to_s
13
- prefix = (prefix =~ /\Ahttp/ ? URI::parse(doc[:uri]).merge(prefix).to_s : "#{doc[:uri]}#{prefix}")
13
+ prefix = (prefix =~ /\Ahttp\:/ or prefix =~ /\Ahttps\:/) ? URI::parse(doc[:uri]).merge(prefix).to_s : "#{doc[:uri]}#{prefix}"
14
14
  suffix = sc::suffix.first.to_s
15
15
 
16
- contents.map do |content|
17
- variable = if sc::sequence.first.to_s=="true"
18
- @indexes[prefix] += 1
16
+ contents.map do |content, attribute|
17
+ new_uri = if (content.to_s =~ /\Ahttp\:/ or content.to_s =~ /\Ahttps\:/)
18
+ "#{content}#{suffix}"
19
19
  else
20
- if sc::downcase.first.to_s=="true"
21
- content.to_s.underscore
20
+ variable = if sc::sequence.first.to_s=="true"
21
+ @indexes[prefix] += 1
22
22
  else
23
- content.to_s.wikify
23
+ if sc::downcase.first.to_s=="true"
24
+ content.to_s.underscore
25
+ else
26
+ content.to_s.wikify
27
+ end
24
28
  end
29
+ "#{prefix}#{variable}#{suffix}"
25
30
  end
26
31
 
27
- new_uri = "#{prefix}#{variable}#{suffix}"
28
-
29
- { :uri=>new_uri, :content=>doc[:content], :value=>new_uri }
32
+ { :uri=>new_uri, :content=>doc[:content], :value=>new_uri, :attribute=>attribute }
30
33
  end
31
34
  end
32
35
  end
@@ -3,7 +3,7 @@ module Sc
3
3
  def filter doc
4
4
  if sc::attribute.first
5
5
  # Select node's attribute if given
6
- sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute] } }
6
+ sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute], :attribute=>attribute } }
7
7
  else
8
8
  [ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:value], sc::format, doc[:uri]) } ]
9
9
  end
@@ -5,7 +5,7 @@ module Sc
5
5
  slices = doc[:value].split(separator)
6
6
  sc::index.map { |index| slices[index.to_i].to_s.strip }.
7
7
  select { |value| value != "" }.
8
- map { |value| { :uri=>doc[:uri], :content=>doc[:content], :value=>value} }
8
+ map { |value| { :uri=>doc[:uri], :content=>doc[:content], :value=>value, :attribute=>doc[:attribute]} }
9
9
  end.flatten
10
10
  end
11
11
  end
@@ -1,37 +1,58 @@
1
1
  module Sc
2
2
  class VisualSelector < Selector
3
3
  def filter doc
4
+ # By initializing variables, we avoid getting data from a hash (slow)
5
+ min_relative_x = (sc::min_relative_x.first.to_i if sc::min_relative_x.first)
6
+ max_relative_x = (sc::max_relative_x.first.to_i if sc::max_relative_x.first)
7
+ min_relative_y = (sc::min_relative_y.first.to_i if sc::min_relative_y.first)
8
+ max_relative_y = (sc::max_relative_y.first.to_i if sc::max_relative_y.first)
9
+ min_x = (sc::min_x.first.to_i if sc::min_x.first)
10
+ max_x = (sc::max_x.first.to_i if sc::max_x.first)
11
+ min_y = (sc::min_y.first.to_i if sc::min_y.first)
12
+ max_y = (sc::max_y.first.to_i if sc::max_y.first)
13
+ min_width = (sc::min_width.first.to_i if sc::min_width.first)
14
+ max_width = (sc::max_width.first.to_i if sc::max_width.first)
15
+ min_height = (sc::min_height.first.to_i if sc::min_height.first)
16
+ max_height = (sc::max_height.first.to_i if sc::max_height.first)
17
+ min_font_size = (sc::min_font_size.first.to_i if sc::min_font_size.first)
18
+ max_font_size = (sc::max_font_size.first.to_i if sc::max_font_size.first)
19
+ min_font_weight = (sc::min_font_weight.first.to_i if sc::min_font_weight.first)
20
+ max_font_weight = (sc::max_font_weight.first.to_i if sc::max_font_weight.first)
21
+ font_family = sc::font_family.first
22
+ attributes = sc::attribute
23
+ formats = sc::format
24
+
4
25
  doc[:content].search(sc::tag.first || "*").select do |node|
5
26
  relative_x = node['vx'].to_i - doc[:content]['vx'].to_i
6
27
  relative_y = node['vy'].to_i - doc[:content]['vy'].to_i
7
28
 
8
29
  !node.text? and
9
- ( !sc::min_relative_x.first or relative_x >= sc::min_relative_x.first.to_i) and
10
- ( !sc::max_relative_x.first or relative_x <= sc::max_relative_x.first.to_i) and
11
- ( !sc::min_relative_y.first or relative_y >= sc::min_relative_y.first.to_i) and
12
- ( !sc::max_relative_y.first or relative_y <= sc::max_relative_y.first.to_i) and
30
+ ( !min_relative_x or relative_x >= min_relative_x) and
31
+ ( !max_relative_x or relative_x <= max_relative_x) and
32
+ ( !min_relative_y or relative_y >= min_relative_y) and
33
+ ( !max_relative_y or relative_y <= max_relative_y) and
13
34
 
14
- ( !sc::min_x.first or node['vx'].to_i >= sc::min_x.first.to_i) and
15
- ( !sc::max_x.first or node['vx'].to_i <= sc::max_x.first.to_i) and
16
- ( !sc::min_y.first or node['vy'].to_i >= sc::min_y.first.to_i) and
17
- ( !sc::max_y.first or node['vy'].to_i <= sc::max_y.first.to_i) and
35
+ ( !min_x or node['vx'].to_i >= min_x) and
36
+ ( !max_x or node['vx'].to_i <= max_x) and
37
+ ( !min_y or node['vy'].to_i >= min_y) and
38
+ ( !max_y or node['vy'].to_i <= max_y) and
18
39
 
19
- ( !sc::min_width.first or node['vw'].to_i >= sc::min_width.first.to_i) and
20
- ( !sc::max_width.first or node['vw'].to_i <= sc::max_width.first.to_i) and
21
- ( !sc::min_height.first or node['vh'].to_i >= sc::min_height.first.to_i) and
22
- ( !sc::max_height.first or node['vh'].to_i <= sc::max_height.first.to_i) and
40
+ ( !min_width or node['vw'].to_i >= min_width) and
41
+ ( !max_width or node['vw'].to_i <= max_width) and
42
+ ( !min_height or node['vh'].to_i >= min_height) and
43
+ ( !max_height or node['vh'].to_i <= max_height) and
23
44
 
24
- ( !sc::min_font_size.first or node['vsize'].to_i >= sc::min_font_size.first.to_i) and
25
- ( !sc::max_font_size.first or node['vsize'].to_i <= sc::max_font_size.first.to_i) and
26
- ( !sc::min_font_weight.first or node['vweight'].to_i >= sc::min_font_weight.first.to_i) and
27
- ( !sc::max_font_weight.first or node['vweight'].to_i <= sc::max_font_weight.first.to_i) and
28
- ( !sc::font_family.first or node['vfont'] == sc::font_family.first)
45
+ ( !min_font_size or node['vsize'].to_i >= min_font_size) and
46
+ ( !max_font_size or node['vsize'].to_i <= max_font_size) and
47
+ ( !min_font_weight or node['vweight'].to_i >= min_font_weight) and
48
+ ( !max_font_weight or node['vweight'].to_i <= max_font_weight) and
49
+ ( !font_family or node['vfont'] == font_family)
29
50
  end.map do |content|
30
- if sc::attribute.first
51
+ if attributes.first
31
52
  # Select node's attribute if given
32
- sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>content, :value=>content[attribute] } }
53
+ attributes.map { |attribute| { :uri=>doc[:uri], :content=>content, :value=>content[attribute], :attribute=>attribute } }
33
54
  else
34
- [ { :uri=>doc[:uri], :content=>content, :value=>format(content, sc::format, doc[:uri]) } ]
55
+ [ { :uri=>doc[:uri], :content=>content, :value=>format(content, formats, doc[:uri]) } ]
35
56
  end
36
57
  end.flatten
37
58
  end
@@ -11,7 +11,7 @@ module Sc
11
11
  (doc[:content].search(pattern)[interval] || []).select { |node| patterns.any? ? patterns.include?(node.text.downcase.strip) : true }.map do |result|
12
12
  if sc::attribute.first
13
13
  # Select node's attribute if given
14
- sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
14
+ sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute], :attribute=>attribute } }
15
15
  else
16
16
  # Select node
17
17
  [ { :uri=>doc[:uri], :content=>result, :value=>format(result, sc::format, doc[:uri]) } ]
@@ -0,0 +1,121 @@
1
+ module Scrappy
2
+ module Optimizer
3
+ # Iterates through a knowledge base and tries to merge and generalize
4
+ # selectors whenever the output of the resulting kb is the same
5
+ def optimize kb, sample
6
+ # Get the output only once
7
+ output = RDF::Graph.new extract(sample[:uri], sample[:html], kb)
8
+
9
+ # Build an array of fragments
10
+ root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
11
+ fragments = []; root_fragments.each { |f| fragments << kb.node(Node(f.id, RDF::Graph.new(f.all_triples))) }
12
+
13
+ # Parse the document
14
+ doc = { :uri=>sample[:uri], :content=>Nokogiri::HTML(sample[:html], nil, 'utf-8') }
15
+
16
+ begin
17
+ changed, fragments = optimize_once fragments, doc, output
18
+ end until !changed
19
+
20
+ graph = RDF::Graph.new
21
+ fragments.each { |fragment| graph << fragment }
22
+
23
+ graph
24
+ end
25
+
26
+ protected
27
+ # Tries to optimize a set of fragments.
28
+ # Returns true if there were changes, false otherwise,
29
+ # and the new fragments as the second array element
30
+ def optimize_once fragments, doc, output
31
+ fragments.each do |fragment1|
32
+ fragments.each do |fragment2|
33
+ next if fragment1 == fragment2
34
+ new_fragment = mix_if_gain(fragment1, fragment2, doc, output)
35
+
36
+ # End if a new fragment was created
37
+ if new_fragment
38
+ return [true, fragments - [fragment1] - [fragment2] + [new_fragment]]
39
+ end
40
+ end
41
+ end
42
+ [false, fragments]
43
+ end
44
+
45
+ def mix_if_gain fragment1, fragment2, doc, output
46
+ # Won't get gain if the fragment does not produce the same kind of RDF resource
47
+ return if !fragment1.sc::type.equivalent?(fragment2.sc::type) or
48
+ !fragment1.sc::relation.equivalent?(fragment2.sc::relation) or
49
+ !fragment1.sc::superclass.equivalent?(fragment2.sc::superclass) or
50
+ !fragment1.sc::sameas.equivalent?(fragment2.sc::sameas) or
51
+ fragment1.sc::identifier.size != fragment2.sc::identifier.size
52
+
53
+ # Build new fragment
54
+ new_fragment = Node(nil)
55
+ new_fragment.rdf::type = fragment1.rdf::type
56
+ new_fragment.sc::type = fragment1.sc::type
57
+ new_fragment.sc::relation = fragment1.sc::relation
58
+ new_fragment.sc::superclass = fragment1.sc::superclass
59
+ new_fragment.sc::sameas = fragment1.sc::sameas
60
+
61
+ # sc:selector
62
+ selector = generalize_selectors(fragment1.sc::selector + fragment2.sc::selector)
63
+ new_fragment.graph << selector
64
+ new_fragment.sc::selector = selector
65
+
66
+ # sc:identifier
67
+ if fragment1.sc::identifier.first
68
+ selector = generalize_selectors(fragment1.sc::identifier + fragment2.sc::identifier)
69
+ new_fragment.graph << selector
70
+ new_fragment.sc::identifier = selector
71
+ end
72
+
73
+ # sc:subfragment
74
+ all_subfragments = fragment1.sc::subfragment + fragment2.sc::subfragment
75
+ all_subfragments.map { |sf| [sf.sc::type.sort_by(&:to_s), sf.sc::relation.sort_by(&:to_s)] }.uniq.each do |types, relations|
76
+ subfragments = all_subfragments.select do |sf|
77
+ sf.sc::type.sort_by(&:to_s) == types and
78
+ sf.sc::relation.sort_by(&:to_s) == relations
79
+ end
80
+ end
81
+
82
+ # Check new output
83
+ separate_output1 = fragment1.extract_graph :doc=>doc
84
+ separate_output2 = fragment2.extract_graph :doc=>doc
85
+ separate_output = separate_output1.merge separate_output2
86
+ new_output = new_fragment.proxy.extract_graph :doc=>doc
87
+
88
+ # Check if the output with the new fragment is a subset of the full output
89
+ # and if the output of the fragments alone is a subset of the output of the new
90
+ # fragment. This way we ensure the output is the same without using all the
91
+ # fragments that are available in the knowledge base.
92
+ new_fragment.proxy # if output.contains?(new_output) and new_output.contains?(separate_output)
93
+ end
94
+
95
+ def generalize_selectors selectors
96
+ selector = Node(nil)
97
+ selector.rdf::type = Node('sc:VisualSelector')
98
+ selector.sc::min_relative_x = selectors.map { |s| s.sc::min_relative_x.map(&:to_i) }.flatten.min.to_s
99
+ selector.sc::max_relative_x = selectors.map { |s| s.sc::max_relative_x.map(&:to_i) }.flatten.max.to_s
100
+ selector.sc::min_relative_y = selectors.map { |s| s.sc::min_relative_y.map(&:to_i) }.flatten.min.to_s
101
+ selector.sc::max_relative_y = selectors.map { |s| s.sc::max_relative_y.map(&:to_i) }.flatten.max.to_s
102
+ selector.sc::min_x = selectors.map { |s| s.sc::min_x.map(&:to_i) }.flatten.min.to_s
103
+ selector.sc::max_x = selectors.map { |s| s.sc::max_x.map(&:to_i) }.flatten.max.to_s
104
+ selector.sc::min_y = selectors.map { |s| s.sc::min_y.map(&:to_i) }.flatten.min.to_s
105
+ selector.sc::max_y = selectors.map { |s| s.sc::max_y.map(&:to_i) }.flatten.max.to_s
106
+ selector.sc::min_width = selectors.map { |s| s.sc::min_width.map(&:to_i) }.flatten.min.to_s
107
+ selector.sc::max_width = selectors.map { |s| s.sc::max_width.map(&:to_i) }.flatten.max.to_s
108
+ selector.sc::min_height = selectors.map { |s| s.sc::min_height.map(&:to_i) }.flatten.min.to_s
109
+ selector.sc::max_height = selectors.map { |s| s.sc::max_height.map(&:to_i) }.flatten.max.to_s
110
+ selector.sc::min_font_size = selectors.map { |s| s.sc::min_font_size.map(&:to_i) }.flatten.min.to_s if selectors.all? { |s| s.sc::min_font_size.first }
111
+ selector.sc::max_font_size = selectors.map { |s| s.sc::max_font_size.map(&:to_i) }.flatten.max.to_s if selectors.all? { |s| s.sc::max_font_size.first }
112
+ selector.sc::min_font_weight = selectors.map { |s| s.sc::min_font_weight.map(&:to_i) }.flatten.min.to_s if selectors.all? { |s| s.sc::min_font_weight.first }
113
+ selector.sc::max_font_weight = selectors.map { |s| s.sc::max_font_weight.map(&:to_i) }.flatten.max.to_s if selectors.all? { |s| s.sc::max_font_weight.first }
114
+ selector.sc::font_family = selectors.first.sc::font_family if selectors.map { |s| s.sc::font_family }.flatten.uniq.size == 1
115
+ selector.sc::tag = selectors.first.sc::tag if selectors.map { |s| s.sc::tag }.flatten.uniq.size == 1
116
+ selector.sc::attribute = selectors.first.sc::attribute if selectors.map { |s| s.sc::attribute }.flatten.uniq.size == 1
117
+
118
+ selector
119
+ end
120
+ end
121
+ end
@@ -6,10 +6,6 @@ module Scrappy
6
6
  triples + train_sample(sample).triples
7
7
  end )
8
8
  end
9
-
10
- # Optimizes the knowledge base by generalizing patterns
11
- def optimize
12
- end
13
9
 
14
10
  private
15
11
  def train_sample sample
@@ -34,37 +30,31 @@ module Scrappy
34
30
  fragment.graph << selector
35
31
  fragment.sc::selector = selector
36
32
  when ID("sc:uri") then
37
- # Assumption: URIs are extracted from a link
38
33
  selector = selector_for(node.sc::uri.first.sc::source.first, node)
39
- selector.sc::tag = "a"
40
- selector.sc::attribute = "href"
41
-
42
34
  fragment.graph << selector
43
35
  fragment.sc::identifier = selector
44
36
  when ID("rdf:type") then
45
37
  fragment.sc::type = node.rdf::type
46
38
  else
47
- if node[predicate].map(&:class).uniq.first != String
48
- subfragments = node[predicate].map { |subnode| fragment_for(subnode, node) }
49
- # Mix the subfragments
50
- id = subfragments.first
51
- graph = RDF::Graph.new( subfragments.inject([]) do |triples, subfragment|
52
- triples + subfragment.graph.triples.map { |s,p,o| [s==subfragment.id ? id : s,p,o] }
53
- end )
54
- subfragment = graph[id]
55
- subfragment.sc::relation = Node(predicate)
56
- subfragment.sc::min_cardinality = "1"
39
+ if node[predicate].map(&:class).uniq.first == RDF::Node
40
+ node[predicate].map do |subnode|
41
+ subfragment = fragment_for(subnode, node)
42
+ subfragment.sc::relation = Node(predicate)
57
43
 
58
- fragment.graph << subfragment
59
- fragment.sc::subfragment += [subfragment]
44
+ fragment.graph << subfragment
45
+ fragment.sc::subfragment += [subfragment]
46
+ end
60
47
  end
61
48
  end
62
49
  end
63
- fragment.rdf::type = Node("sc:Fragment") if parent.nil?
50
+ fragment.rdf::type = Node("sc:Fragment")
51
+ fragment.sc::min_cardinality = "1"
52
+ fragment.sc::max_cardinality = "1"
64
53
  fragment
65
54
  end
66
55
 
67
56
  def selector_for fragment, parent=nil
57
+ fragment_selector = fragment.sc::selector.first
68
58
  presentation = fragment.sc::presentation.first
69
59
 
70
60
  selector = Node(nil)
@@ -94,6 +84,9 @@ module Scrappy
94
84
  selector.sc::min_font_weight = presentation.sc::font_weight
95
85
  selector.sc::max_font_weight = presentation.sc::font_weight
96
86
  selector.sc::font_family = presentation.sc::font_family
87
+
88
+ selector.sc::tag = fragment_selector.sc::tag.select { |tag| ["a","img"].include?(tag) }
89
+ selector.sc::attribute = fragment_selector.sc::attribute
97
90
 
98
91
  selector
99
92
  end
@@ -62,7 +62,7 @@ module Scrappy
62
62
  map { |node| node.sc::type }.flatten.map(&:to_s).sort
63
63
  haml :patterns
64
64
  end
65
-
65
+
66
66
  app.delete '/patterns/*' do |uri|
67
67
  Scrappy::App.delete_pattern uri
68
68
  flash[:notice] = "Pattern deleted"
@@ -94,6 +94,12 @@ module Scrappy
94
94
  redirect "#{settings.base_uri}/samples"
95
95
  end
96
96
 
97
+ app.post '/samples/:id/optimize' do |id|
98
+ Scrappy::App.save_patterns agent.optimize(Scrappy::Kb.patterns, Scrappy::App.samples[id.to_i])
99
+ flash[:notice] = "Optimization completed"
100
+ redirect "#{settings.base_uri}/samples"
101
+ end
102
+
97
103
  app.post '/samples' do
98
104
  html = Iconv.iconv('UTF-8', params[:encoding], params[:html]).first
99
105
  sample = Scrappy::App.add_sample(:html=>html, :uri=>params[:uri], :date=>Time.now)
@@ -29,4 +29,28 @@ class String
29
29
  tr("-", "_").
30
30
  downcase
31
31
  end
32
+ end
33
+
34
+ class Array
35
+ # Return true if a given array has the same elements as this one
36
+ def equivalent? array
37
+ self.all? { |i| array.include?(i) } and
38
+ array.all? { |i| self.include?(i) }
39
+ end
40
+ end
41
+
42
+ module RDF
43
+ class Node
44
+ def self.mix *nodes
45
+ id = nodes.first
46
+ graph = RDF::Graph.new( nodes.inject([]) do |triples, node|
47
+ triples + node.graph.triples.map do |s,p,o|
48
+ [ s==node.id ? id : s,
49
+ p==node.id ? id : p,
50
+ o==node.id ? id : o ]
51
+ end
52
+ end )
53
+ graph[id]
54
+ end
55
+ end
32
56
  end
data/lib/scrappy.rb CHANGED
@@ -15,7 +15,8 @@ require 'scrappy/support'
15
15
  require 'scrappy/repository'
16
16
 
17
17
  require 'scrappy/extractor/extractor'
18
- require 'scrappy/trainer/trainer'
18
+ require 'scrappy/learning/trainer'
19
+ require 'scrappy/learning/optimizer'
19
20
  require 'scrappy/agent/map_reduce'
20
21
  require 'scrappy/agent/cache'
21
22
  require 'scrappy/agent/dumper'
@@ -23,5 +24,5 @@ require 'scrappy/agent/blind_agent'
23
24
  require 'scrappy/agent/agent'
24
25
 
25
26
  module Scrappy
26
- VERSION = '0.3.2'
27
+ VERSION = '0.3.3'
27
28
  end
@@ -16,16 +16,28 @@ var add_visual_data = function() {
16
16
  item.setAttribute('vy', y);
17
17
  item.setAttribute('vw', item.offsetWidth);
18
18
  item.setAttribute('vh', item.offsetHeight);
19
- var size = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-size');
20
- size = size.substring(0, size.length-2);
21
- item.setAttribute('vsize', size);
22
- var fonts = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-family').split(",");
23
- var font = fonts[fonts.length-1].trim();
24
- item.setAttribute('vfont', font);
25
- var weight = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-weight');
26
- if (weight == 'normal') weight = 400;
27
- if (weight == 'bold') weight = 700;
28
- item.setAttribute('vweight', weight);
19
+
20
+ var item_with_text = false;
21
+ for (var k=0; k<item.childNodes.length; k++) {
22
+ child = item.childNodes[k]
23
+ if (child.nodeName == "#text" && child.textContent.trim() != "") {
24
+ item_with_text = true;
25
+ break;
26
+ }
27
+ }
28
+
29
+ if (item_with_text) {
30
+ var size = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-size');
31
+ size = size.substring(0, size.length-2);
32
+ item.setAttribute('vsize', size);
33
+ var fonts = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-family').split(",");
34
+ var font = fonts[fonts.length-1].trim();
35
+ item.setAttribute('vfont', font);
36
+ var weight = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-weight');
37
+ if (weight == 'normal') weight = 400;
38
+ if (weight == 'bold') weight = 700;
39
+ item.setAttribute('vweight', weight);
40
+ }
29
41
  }
30
42
  }
31
43
 
@@ -166,7 +166,7 @@ ul.detail li span.name, ul.detail li span.short_name {
166
166
  font-family: monospace;
167
167
  }
168
168
  ul.detail li span.short_name {
169
- width: 420px;
169
+ width: 350px;
170
170
  }
171
171
  ul.detail li span.format {
172
172
  float: right;
data/scrappy.gemspec CHANGED
@@ -2,30 +2,30 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scrappy}
5
- s.version = "0.3.2"
5
+ s.version = "0.3.3"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
9
- s.date = %q{2011-03-18}
9
+ s.date = %q{2011-03-25}
10
10
  s.default_executable = %q{scrappy}
11
11
  s.description = %q{RDF web scraper}
12
12
  s.email = %q{joseignacio.fernandez@gmail.com}
13
13
  s.executables = ["scrappy"]
14
- s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/extractor/extractor.rb", "lib/scrappy/extractor/formats.rb", "lib/scrappy/extractor/fragment.rb", "lib/scrappy/extractor/selector.rb", "lib/scrappy/extractor/selectors/base_uri.rb", "lib/scrappy/extractor/selectors/css.rb", "lib/scrappy/extractor/selectors/new_uri.rb", "lib/scrappy/extractor/selectors/root.rb", "lib/scrappy/extractor/selectors/section.rb", "lib/scrappy/extractor/selectors/slice.rb", "lib/scrappy/extractor/selectors/uri.rb", "lib/scrappy/extractor/selectors/uri_pattern.rb", "lib/scrappy/extractor/selectors/visual.rb", "lib/scrappy/extractor/selectors/xpath.rb", "lib/scrappy/repository.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb", "lib/scrappy/trainer/trainer.rb"]
15
- s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/extractor/extractor.rb", "lib/scrappy/extractor/formats.rb", "lib/scrappy/extractor/fragment.rb", "lib/scrappy/extractor/selector.rb", "lib/scrappy/extractor/selectors/base_uri.rb", "lib/scrappy/extractor/selectors/css.rb", "lib/scrappy/extractor/selectors/new_uri.rb", "lib/scrappy/extractor/selectors/root.rb", "lib/scrappy/extractor/selectors/section.rb", "lib/scrappy/extractor/selectors/slice.rb", "lib/scrappy/extractor/selectors/uri.rb", "lib/scrappy/extractor/selectors/uri_pattern.rb", "lib/scrappy/extractor/selectors/visual.rb", "lib/scrappy/extractor/selectors/xpath.rb", "lib/scrappy/repository.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb", "lib/scrappy/trainer/trainer.rb", "public/favicon.ico", "public/images/logo.png", "public/images/logo_tiny.png", "public/javascripts/annotator.js", "public/javascripts/remote.js", "public/stylesheets/application.css", "test/test_helper.rb", "test/test_scrappy.rb", "views/extractors.haml", "views/help.haml", "views/home.haml", "views/layout.haml", "views/patterns.haml", "views/samples.haml", "scrappy.gemspec"]
14
+ s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "extractors/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/extractor/extractor.rb", "lib/scrappy/extractor/formats.rb", "lib/scrappy/extractor/fragment.rb", "lib/scrappy/extractor/selector.rb", "lib/scrappy/extractor/selectors/base_uri.rb", "lib/scrappy/extractor/selectors/css.rb", "lib/scrappy/extractor/selectors/new_uri.rb", "lib/scrappy/extractor/selectors/root.rb", "lib/scrappy/extractor/selectors/section.rb", "lib/scrappy/extractor/selectors/slice.rb", "lib/scrappy/extractor/selectors/uri.rb", "lib/scrappy/extractor/selectors/uri_pattern.rb", "lib/scrappy/extractor/selectors/visual.rb", "lib/scrappy/extractor/selectors/xpath.rb", "lib/scrappy/learning/optimizer.rb", "lib/scrappy/learning/trainer.rb", "lib/scrappy/repository.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb"]
15
+ s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "extractors/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/extractor/extractor.rb", "lib/scrappy/extractor/formats.rb", "lib/scrappy/extractor/fragment.rb", "lib/scrappy/extractor/selector.rb", "lib/scrappy/extractor/selectors/base_uri.rb", "lib/scrappy/extractor/selectors/css.rb", "lib/scrappy/extractor/selectors/new_uri.rb", "lib/scrappy/extractor/selectors/root.rb", "lib/scrappy/extractor/selectors/section.rb", "lib/scrappy/extractor/selectors/slice.rb", "lib/scrappy/extractor/selectors/uri.rb", "lib/scrappy/extractor/selectors/uri_pattern.rb", "lib/scrappy/extractor/selectors/visual.rb", "lib/scrappy/extractor/selectors/xpath.rb", "lib/scrappy/learning/optimizer.rb", "lib/scrappy/learning/trainer.rb", "lib/scrappy/repository.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb", "public/favicon.ico", "public/images/logo.png", "public/images/logo_tiny.png", "public/javascripts/annotator.js", "public/javascripts/remote.js", "public/stylesheets/application.css", "test/test_helper.rb", "test/test_scrappy.rb", "views/extractors.haml", "views/help.haml", "views/home.haml", "views/layout.haml", "views/patterns.haml", "views/samples.haml", "scrappy.gemspec"]
16
16
  s.homepage = %q{http://github.com/josei/scrappy}
17
17
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
18
18
  s.require_paths = ["lib"]
19
19
  s.rubyforge_project = %q{scrappy}
20
- s.rubygems_version = %q{1.3.7}
20
+ s.rubygems_version = %q{1.3.6}
21
21
  s.summary = %q{Web scraper that allows producing RDF data out of plain web pages}
22
- s.test_files = ["test/test_helper.rb", "test/test_scrappy.rb"]
22
+ s.test_files = ["test/test_scrappy.rb", "test/test_helper.rb"]
23
23
 
24
24
  if s.respond_to? :specification_version then
25
25
  current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
26
26
  s.specification_version = 3
27
27
 
28
- if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
28
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
29
29
  s.add_runtime_dependency(%q<activesupport>, [">= 2.3.5"])
30
30
  s.add_runtime_dependency(%q<sinatra>, [">= 1.1.2"])
31
31
  s.add_runtime_dependency(%q<thin>, [">= 1.2.7"])
data/views/samples.haml CHANGED
@@ -21,6 +21,8 @@
21
21
  -[['Patterns output', :patterns], ['Extractors output', :extractors]].reverse.each do |text, action|
22
22
  %span.format
23
23
  %a{:href=>"#{settings.base_uri}/samples/#{i}/#{action}"}=text
24
+ %span.format
25
+ %a{:href=>"#{settings.base_uri}/samples/#{i}/optimize", :'data-method'=>:post} Optimize
24
26
  %span.format
25
27
  %a{:href=>"#{settings.base_uri}/samples/#{i}/train", :'data-method'=>:post} Train
26
28
  %span.date
metadata CHANGED
@@ -1,13 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrappy
3
3
  version: !ruby/object:Gem::Version
4
- hash: 23
5
4
  prerelease: false
6
5
  segments:
7
6
  - 0
8
7
  - 3
9
- - 2
10
- version: 0.3.2
8
+ - 3
9
+ version: 0.3.3
11
10
  platform: ruby
12
11
  authors:
13
12
  - Jose Ignacio
@@ -15,18 +14,16 @@ autorequire:
15
14
  bindir: bin
16
15
  cert_chain: []
17
16
 
18
- date: 2011-03-18 00:00:00 +01:00
17
+ date: 2011-03-25 00:00:00 +01:00
19
18
  default_executable:
20
19
  dependencies:
21
20
  - !ruby/object:Gem::Dependency
22
21
  name: activesupport
23
22
  prerelease: false
24
23
  requirement: &id001 !ruby/object:Gem::Requirement
25
- none: false
26
24
  requirements:
27
25
  - - ">="
28
26
  - !ruby/object:Gem::Version
29
- hash: 9
30
27
  segments:
31
28
  - 2
32
29
  - 3
@@ -38,11 +35,9 @@ dependencies:
38
35
  name: sinatra
39
36
  prerelease: false
40
37
  requirement: &id002 !ruby/object:Gem::Requirement
41
- none: false
42
38
  requirements:
43
39
  - - ">="
44
40
  - !ruby/object:Gem::Version
45
- hash: 23
46
41
  segments:
47
42
  - 1
48
43
  - 1
@@ -54,11 +49,9 @@ dependencies:
54
49
  name: thin
55
50
  prerelease: false
56
51
  requirement: &id003 !ruby/object:Gem::Requirement
57
- none: false
58
52
  requirements:
59
53
  - - ">="
60
54
  - !ruby/object:Gem::Version
61
- hash: 17
62
55
  segments:
63
56
  - 1
64
57
  - 2
@@ -70,11 +63,9 @@ dependencies:
70
63
  name: nokogiri
71
64
  prerelease: false
72
65
  requirement: &id004 !ruby/object:Gem::Requirement
73
- none: false
74
66
  requirements:
75
67
  - - ">="
76
68
  - !ruby/object:Gem::Version
77
- hash: 5
78
69
  segments:
79
70
  - 1
80
71
  - 4
@@ -86,11 +77,9 @@ dependencies:
86
77
  name: mechanize
87
78
  prerelease: false
88
79
  requirement: &id005 !ruby/object:Gem::Requirement
89
- none: false
90
80
  requirements:
91
81
  - - ">="
92
82
  - !ruby/object:Gem::Version
93
- hash: 23
94
83
  segments:
95
84
  - 1
96
85
  - 0
@@ -102,11 +91,9 @@ dependencies:
102
91
  name: lightrdf
103
92
  prerelease: false
104
93
  requirement: &id006 !ruby/object:Gem::Requirement
105
- none: false
106
94
  requirements:
107
95
  - - ">="
108
96
  - !ruby/object:Gem::Version
109
- hash: 19
110
97
  segments:
111
98
  - 0
112
99
  - 3
@@ -118,11 +105,9 @@ dependencies:
118
105
  name: i18n
119
106
  prerelease: false
120
107
  requirement: &id007 !ruby/object:Gem::Requirement
121
- none: false
122
108
  requirements:
123
109
  - - ">="
124
110
  - !ruby/object:Gem::Version
125
- hash: 11
126
111
  segments:
127
112
  - 0
128
113
  - 4
@@ -134,11 +119,9 @@ dependencies:
134
119
  name: rest-client
135
120
  prerelease: false
136
121
  requirement: &id008 !ruby/object:Gem::Requirement
137
- none: false
138
122
  requirements:
139
123
  - - ">="
140
124
  - !ruby/object:Gem::Version
141
- hash: 13
142
125
  segments:
143
126
  - 1
144
127
  - 6
@@ -150,11 +133,9 @@ dependencies:
150
133
  name: haml
151
134
  prerelease: false
152
135
  requirement: &id009 !ruby/object:Gem::Requirement
153
- none: false
154
136
  requirements:
155
137
  - - ">="
156
138
  - !ruby/object:Gem::Version
157
- hash: 55
158
139
  segments:
159
140
  - 3
160
141
  - 0
@@ -166,11 +147,9 @@ dependencies:
166
147
  name: rack-flash
167
148
  prerelease: false
168
149
  requirement: &id010 !ruby/object:Gem::Requirement
169
- none: false
170
150
  requirements:
171
151
  - - ">="
172
152
  - !ruby/object:Gem::Version
173
- hash: 25
174
153
  segments:
175
154
  - 0
176
155
  - 1
@@ -187,6 +166,7 @@ extensions: []
187
166
  extra_rdoc_files:
188
167
  - README.rdoc
189
168
  - bin/scrappy
169
+ - extractors/elmundo.yarf
190
170
  - lib/scrappy.rb
191
171
  - lib/scrappy/agent/agent.rb
192
172
  - lib/scrappy/agent/blind_agent.rb
@@ -207,20 +187,21 @@ extra_rdoc_files:
207
187
  - lib/scrappy/extractor/selectors/uri_pattern.rb
208
188
  - lib/scrappy/extractor/selectors/visual.rb
209
189
  - lib/scrappy/extractor/selectors/xpath.rb
190
+ - lib/scrappy/learning/optimizer.rb
191
+ - lib/scrappy/learning/trainer.rb
210
192
  - lib/scrappy/repository.rb
211
193
  - lib/scrappy/server/admin.rb
212
194
  - lib/scrappy/server/errors.rb
213
195
  - lib/scrappy/server/helpers.rb
214
196
  - lib/scrappy/server/server.rb
215
197
  - lib/scrappy/support.rb
216
- - lib/scrappy/trainer/trainer.rb
217
198
  files:
218
199
  - History.txt
219
200
  - Manifest
220
201
  - README.rdoc
221
202
  - Rakefile
222
203
  - bin/scrappy
223
- - kb/elmundo.yarf
204
+ - extractors/elmundo.yarf
224
205
  - lib/scrappy.rb
225
206
  - lib/scrappy/agent/agent.rb
226
207
  - lib/scrappy/agent/blind_agent.rb
@@ -241,13 +222,14 @@ files:
241
222
  - lib/scrappy/extractor/selectors/uri_pattern.rb
242
223
  - lib/scrappy/extractor/selectors/visual.rb
243
224
  - lib/scrappy/extractor/selectors/xpath.rb
225
+ - lib/scrappy/learning/optimizer.rb
226
+ - lib/scrappy/learning/trainer.rb
244
227
  - lib/scrappy/repository.rb
245
228
  - lib/scrappy/server/admin.rb
246
229
  - lib/scrappy/server/errors.rb
247
230
  - lib/scrappy/server/helpers.rb
248
231
  - lib/scrappy/server/server.rb
249
232
  - lib/scrappy/support.rb
250
- - lib/scrappy/trainer/trainer.rb
251
233
  - public/favicon.ico
252
234
  - public/images/logo.png
253
235
  - public/images/logo_tiny.png
@@ -278,20 +260,16 @@ rdoc_options:
278
260
  require_paths:
279
261
  - lib
280
262
  required_ruby_version: !ruby/object:Gem::Requirement
281
- none: false
282
263
  requirements:
283
264
  - - ">="
284
265
  - !ruby/object:Gem::Version
285
- hash: 3
286
266
  segments:
287
267
  - 0
288
268
  version: "0"
289
269
  required_rubygems_version: !ruby/object:Gem::Requirement
290
- none: false
291
270
  requirements:
292
271
  - - ">="
293
272
  - !ruby/object:Gem::Version
294
- hash: 11
295
273
  segments:
296
274
  - 1
297
275
  - 2
@@ -299,10 +277,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
299
277
  requirements: []
300
278
 
301
279
  rubyforge_project: scrappy
302
- rubygems_version: 1.3.7
280
+ rubygems_version: 1.3.6
303
281
  signing_key:
304
282
  specification_version: 3
305
283
  summary: Web scraper that allows producing RDF data out of plain web pages
306
284
  test_files:
307
- - test/test_helper.rb
308
285
  - test/test_scrappy.rb
286
+ - test/test_helper.rb