scrappy 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,10 @@
1
+ === 0.3.3 2011-03-25
2
+
3
+ * Fix in NewUriSelector
4
+ * Improved extraction process
5
+ * Removed -R option
6
+ * Removed irrelevant references to base URIs
7
+
1
8
  === 0.3.2 2011-03-18
2
9
 
3
10
  * Correction of issue with certain Ruby versions
data/Manifest CHANGED
@@ -3,7 +3,7 @@ Manifest
3
3
  README.rdoc
4
4
  Rakefile
5
5
  bin/scrappy
6
- kb/elmundo.yarf
6
+ extractors/elmundo.yarf
7
7
  lib/scrappy.rb
8
8
  lib/scrappy/agent/agent.rb
9
9
  lib/scrappy/agent/blind_agent.rb
@@ -24,13 +24,14 @@ lib/scrappy/extractor/selectors/uri.rb
24
24
  lib/scrappy/extractor/selectors/uri_pattern.rb
25
25
  lib/scrappy/extractor/selectors/visual.rb
26
26
  lib/scrappy/extractor/selectors/xpath.rb
27
+ lib/scrappy/learning/optimizer.rb
28
+ lib/scrappy/learning/trainer.rb
27
29
  lib/scrappy/repository.rb
28
30
  lib/scrappy/server/admin.rb
29
31
  lib/scrappy/server/errors.rb
30
32
  lib/scrappy/server/helpers.rb
31
33
  lib/scrappy/server/server.rb
32
34
  lib/scrappy/support.rb
33
- lib/scrappy/trainer/trainer.rb
34
35
  public/favicon.ico
35
36
  public/images/logo.png
36
37
  public/images/logo_tiny.png
data/bin/scrappy CHANGED
@@ -40,8 +40,7 @@ module Scrappy
40
40
  opts.on('-c C', '--concurrence C') { |c| Agent::Options.workers = c.to_i }
41
41
  opts.on('-d D', '--delay D') { |d| Agent::Options.delay = d; Agent::Options.workers = 1 }
42
42
  opts.on('-l L', '--levels L') { |l| Agent::Options.depth = l.to_i }
43
- opts.on('-r', '--reference') { Agent::Options.referenceable = :minimum }
44
- opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
43
+ opts.on('-r', '--reference') { Agent::Options.referenceable = true }
45
44
  opts.on('-f FORMAT', '--format FORMAT') { |f| Agent::Options.format = f.to_sym }
46
45
  opts.on('-t TIME', '--time TIME') { |t| Agent::Options.time = t.to_i*60 } # converts minutes to seconds
47
46
  opts.on('-o URIs', '--observe URIs') { |uris| Options.observe = uris.split(',') }
@@ -106,9 +105,12 @@ module Scrappy
106
105
  end
107
106
  def self.add_pattern graph
108
107
  new_patterns = Scrappy::Kb.patterns.merge graph
109
- open(@patterns_file, "w") { |f| f.write new_patterns.serialize(:yarf) }
108
+ save_patterns new_patterns
110
109
  onload
111
110
  end
111
+ def self.save_patterns new_patterns
112
+ open(@patterns_file, "w") { |f| f.write new_patterns.serialize(:yarf) }
113
+ end
112
114
  def self.delete_pattern uri
113
115
  graph = Scrappy::Kb.patterns
114
116
  fragments = graph.find(nil, Node('rdf:type'), Node('sc:Fragment')).
@@ -169,8 +171,7 @@ Options
169
171
  -a, --admin [ROOT] Runs admin web server (optionally specify server's root url)
170
172
  -P, --port PORT Selects port number (default is 3434)
171
173
  -t, --time TIME Returns repository data from the last given minutes
172
- -r, --reference Outputs referenceable data
173
- -R, --reference-all Outputs all HTML referenceable data
174
+ -r, --reference Outputs reference information
174
175
 
175
176
  Authors
176
177
  José Ignacio Fernández, Alberto Mardomingo, Jacobo Blasco
File without changes
@@ -3,6 +3,7 @@ module Scrappy
3
3
  include MonitorMixin
4
4
  include Extractor
5
5
  include Trainer
6
+ include Optimizer
6
7
  include MapReduce
7
8
  include Cached
8
9
  include BlindAgent
@@ -28,9 +28,6 @@ module Scrappy
28
28
  end
29
29
  end
30
30
 
31
- # Add references to sources if requested
32
- triples += add_referenceable_data uri, content, triples, referenceable if referenceable
33
-
34
31
  puts "done!" if self.options.debug
35
32
 
36
33
  triples
@@ -38,71 +35,24 @@ module Scrappy
38
35
  end
39
36
 
40
37
  def fragments_for kb, uri
41
- uri_selectors = ( kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')) +
42
- kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector')) ).
43
- flatten.select do |uri_selector|
44
- !kb.node(uri_selector).filter(:uri=>uri).empty?
38
+ root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
39
+
40
+ selectors = []
41
+ fragments = {}
42
+ root_fragments.each do |fragment|
43
+ fragment.sc::selector.each do |selector|
44
+ fragments[selector] = fragment
45
+ selectors << selector
46
+ end
45
47
  end
46
-
47
- visual_selectors = kb.find(nil, Node('rdf:type'), Node('sc:VisualSelector'))
48
-
49
- selectors = uri_selectors + visual_selectors
50
-
51
- selectors.map { |selector| kb.find(nil, Node('sc:selector'), selector) }.
52
- flatten.
53
- select { |selector| selector.rdf::type.include?(Node('sc:Fragment')) }
54
- end
55
-
56
- private
57
- def add_referenceable_data uri, content, given_triples, referenceable
58
- triples = []
59
- resources = {}; given_triples.each { |s,p,o| resources[s] = resources[o] = true }
60
48
 
61
- fragment = Node(Extractor.node_hash(uri, '/'))
62
- selector = Node(nil)
63
- presentation = Node(nil)
64
-
65
- selector.rdf::type = Node('sc:UnivocalSelector')
66
- selector.sc::path = '/'
67
- selector.sc::document = uri
49
+ uri_selectors = selectors.select { |selector| selector.rdf::type.include?(Node('sc:UriSelector')) or
50
+ selector.rdf::type.include?(Node('sc:UriPatternSelector')) }.
51
+ select { |selector| !kb.node(selector).filter(:uri=>uri).empty? }
68
52
 
69
- fragment.sc::selector = selector
70
-
71
- triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples) if referenceable==:dump or resources[fragment.id]
72
-
73
- content.search('*').each do |node|
74
- next if node.text?
75
-
76
- fragment = Extractor.node_hash(uri, node.path)
77
-
78
- if referenceable == :dump or resources[fragment]
79
- selector = ID(nil)
80
- presentation = ID(nil)
81
-
82
- triples << [selector, ID('rdf:type'), ID('sc:UnivocalSelector')]
83
- triples << [selector, ID('sc:path'), node.path.to_s]
84
- triples << [selector, ID('sc:tag'), node.name.to_s]
85
- triples << [selector, ID('sc:document'), uri]
86
-
87
- triples << [presentation, ID('sc:x'), node[:vx].to_s] if node[:vx]
88
- triples << [presentation, ID('sc:y'), node[:vy].to_s] if node[:vy]
89
- triples << [presentation, ID('sc:width'), node[:vw].to_s] if node[:vw]
90
- triples << [presentation, ID('sc:height'), node[:vh].to_s] if node[:vh]
91
- triples << [presentation, ID('sc:font_size'), node[:vsize].gsub("px","").to_s] if node[:vsize]
92
- triples << [presentation, ID('sc:font_family'), node[:vfont]] if node[:vfont]
93
- triples << [presentation, ID('sc:font_weight'), node[:vweight].to_s] if node[:vweight]
94
- triples << [presentation, ID('sc:text'), node.text.strip]
95
-
96
- triples << [fragment, ID('sc:selector'), selector]
97
- triples << [fragment, ID('sc:presentation'), presentation]
98
- end
99
- end
100
- triples
101
- end
53
+ visual_selectors = selectors.select { |selector| selector.rdf::type.include?(Node('sc:VisualSelector')) }
102
54
 
103
- def self.node_hash uri, path
104
- digest = Digest::MD5.hexdigest("#{uri} #{path}")
105
- :"_:bnode#{digest}"
55
+ (uri_selectors + visual_selectors).map { |selector| fragments[selector] }
106
56
  end
107
57
  end
108
58
  end
@@ -2,101 +2,126 @@ module Sc
2
2
  class Fragment
3
3
  include RDF::NodeProxy
4
4
 
5
+ # Extracts data out of a document and returns an RDF::Graph
6
+ def extract_graph options={}
7
+ graph = RDF::Graph.new
8
+ extract(options).each { |node| graph << node }
9
+ graph
10
+ end
11
+
12
+ # Extracts data out of a document and returns an array of nodes
5
13
  def extract options={}
6
- uri = options[:doc][:uri]
14
+ # Extracts all the mappings and any subfragment
15
+ mappings(options).map do |result|
16
+ node = result[:node]
17
+ subfragments = result[:subfragments]
18
+ doc = result[:doc]
19
+
20
+ # Process subfragments
21
+ consistent = true
22
+ subfragments.each do |subfragment|
23
+ # Get subfragment object
24
+ subfragment = subfragment.proxy Node('sc:Fragment')
25
+
26
+ # Extract data from the subfragment
27
+ subnodes = subfragment.extract(options.merge(:doc=>doc))
28
+
29
+ # Add relations
30
+ subnodes.each do |subnode|
31
+ node.graph << subnode if subnode.is_a?(RDF::Node)
32
+ subfragment.sc::relation.each { |relation| node[relation] += [subnode] }
33
+ end
34
+
35
+ # Check consistency
36
+ consistent = false if subfragment.sc::min_cardinality.first and subnodes.size < subfragment.sc::min_cardinality.first.to_i
37
+ consistent = false if subfragment.sc::max_cardinality.first and subnodes.size > subfragment.sc::max_cardinality.first.to_i
38
+ end
39
+
40
+ # Skip the node if it has inconsistent relations
41
+ # For example: extracting a sioc:Post with no dc:title would
42
+ # violate the constraint sc:min_cardinality = 1
43
+ next if !consistent
44
+
45
+ node
46
+ end.compact
47
+ end
7
48
 
49
+ # Returns all the mappings between this fragment and RDF nodes
50
+ def mappings options
8
51
  # Identify the fragment's mappings
9
52
  docs = sc::selector.map { |s| graph.node(s).select options[:doc] }.flatten
10
53
 
11
- # Generate nodes for each page mapping
54
+ # Generate a result for each page mapping
12
55
  docs.map do |doc|
13
56
  # Build RDF nodes from identifier selectors (if present)
14
- nodes = self.nodes(uri, doc, options[:referenceable])
57
+ node = build_node(doc, options[:referenceable])
15
58
 
16
- # Add info to each node
17
- nodes.map do |node|
18
- # Build the object -- it can be a node or a literal
19
- object = if sc::type.include?(Node('rdf:Literal'))
20
- value = doc[:value].to_s.strip
21
- if options[:referenceable]
22
- node.rdf::value = value
23
- node.rdf::type = Node('rdf:Literal')
24
- node
25
- else
26
- value
27
- end
28
- else
29
- # Add statements about the node
30
- sc::type.each { |type| node.rdf::type += [type] if type != Node('rdf:Resource') }
31
- sc::superclass.each { |superclass| node.rdfs::subClassOf += [superclass] }
32
- sc::sameas.each { |samenode| node.owl::sameAs += [samenode] }
59
+ # Skip the node if no URI or bnode is created
60
+ next if !node
61
+
62
+ # Add info to the node
33
63
 
64
+ # Build the object -- it can be a node or a literal
65
+ object = if sc::type.include?(Node('rdf:Literal'))
66
+ value = doc[:value].to_s.strip
67
+ if options[:referenceable]
68
+ node.rdf::value = value
69
+ node.rdf::type = Node('rdf:Literal')
34
70
  node
71
+ else
72
+ value
35
73
  end
74
+ else
75
+ # Add statements about the node
76
+ sc::type.each { |type| node.rdf::type += [type] if type != Node('rdf:Resource') }
77
+ sc::superclass.each { |superclass| node.rdfs::subClassOf += [superclass] }
78
+ sc::sameas.each { |samenode| node.owl::sameAs += [samenode] }
36
79
 
37
- # Process subfragments
38
- consistent = true
39
- sc::subfragment.each do |subfragment|
40
- # Get subfragment object
41
- subfragment = graph.node(subfragment, Node('sc:Fragment'))
42
- # Extract data from the subfragment
43
- subnodes = subfragment.extract(options.merge(:doc=>doc))
44
-
45
- # Add relations
46
- subnodes.each do |subnode|
47
- node.graph << subnode if subnode.is_a?(RDF::Node)
48
- subfragment.sc::relation.each { |relation| node[relation] += [subnode] }
49
- end
50
-
51
- # Check consistency
52
- consistent = false if subfragment.sc::min_cardinality.first and subnodes.size < subfragment.sc::min_cardinality.first.to_i
53
- consistent = false if subfragment.sc::max_cardinality.first and subnodes.size > subfragment.sc::max_cardinality.first.to_i
54
- end
80
+ node
81
+ end
55
82
 
56
- # Skip the node if it has inconsistent relations
57
- # For example: extracting a sioc:Post with no dc:title would
58
- # violate the constraint sc:min_cardinality = 1
59
- next if !consistent
60
-
61
- # Add referenceable data if requested
62
- if options[:referenceable]
63
- sources = [doc[:content]].flatten.map { |n| Node(Scrappy::Extractor.node_hash(doc[:uri], n.path)) }
64
- sources.each do |source|
65
- sc::type.each { |type| source.sc::type += [type] }
66
- sc::relation.each { |relation| source.sc::relation += [relation] }
67
- node.graph << source
68
- node.sc::source += [source]
69
- end
70
- end
71
-
72
- # Object points to either the node or the literal
73
- object
83
+ # Add referenceable data if requested
84
+ if options[:referenceable] and node.size > 0
85
+ source = reference(doc)
86
+ source.sc::type = sc::type
87
+ source.sc::superclass = sc::superclass
88
+ source.sc::sameas = sc::sameas
89
+ source.sc::relation = sc::relation
90
+ node.graph << source
91
+ node.sc::source = source
74
92
  end
75
- end.flatten.compact
93
+
94
+ # Variable object points to either a node or a literal
95
+ # Return the object, as well as its subfragments (if any)
96
+ # and the doc it was extracted from
97
+ { :node=>object, :subfragments=>sc::subfragment, :doc=>doc }
98
+ end.compact
76
99
  end
77
100
 
78
- def nodes uri, doc, referenceable
79
- nodes = sc::identifier.map { |s| graph.node(s).select doc }.flatten.map do |d|
80
- node = Node(parse_uri(uri, d[:value]))
101
+ private
102
+ # Builds a node given a document
103
+ def build_node doc, referenceable
104
+ return Node(nil) if sc::identifier.empty?
105
+
106
+ sc::identifier.map { |s| graph.node(s).select doc }.flatten.map do |d|
107
+ node = Node(parse_uri(d[:uri], d[:value]))
81
108
 
82
109
  if referenceable
83
110
  # Include the fragment where the URI was built from
84
- uri_node = Node(nil, node.graph)
85
- hash = Scrappy::Extractor.node_hash(d[:uri], d[:content].path)
86
-
87
- node.sc::uri = uri_node
111
+ uri_node = Node(nil)
112
+ source = reference(d)
113
+ uri_node.graph << source
88
114
  uri_node.rdf::value = node.to_s
89
- uri_node.sc::source = Node(hash)
115
+ uri_node.sc::source = source
116
+
117
+ node.graph << uri_node
118
+ node.sc::uri = uri_node
90
119
  end
91
120
 
92
121
  node
93
- end
94
- nodes << Node(nil) if nodes.empty?
95
-
96
- nodes
122
+ end.first
97
123
  end
98
124
 
99
- private
100
125
  # Parses a URI by resolving relative paths
101
126
  def parse_uri(uri, rel_uri)
102
127
  return ID('*') if rel_uri.nil?
@@ -107,5 +132,41 @@ module Sc
107
132
  end
108
133
  end
109
134
 
135
+ # Builds an RDF reference to an HTML node
136
+ def reference doc
137
+ node = doc[:content].is_a?(Nokogiri::XML::NodeSet) ? doc[:content].first.parent : doc[:content]
138
+ attribute = doc[:attribute]
139
+ uri = doc[:uri]
140
+
141
+ source = Node(nil)
142
+ selector = Node(nil)
143
+ presentation = Node(nil)
144
+
145
+ source.graph << selector
146
+ source.sc::selector = selector
147
+
148
+ selector.rdf::type = Node('sc:UnivocalSelector')
149
+ selector.sc::path = node.path
150
+ selector.sc::document = uri
151
+ selector.sc::attribute = attribute if attribute
152
+
153
+ if node.path != '/'
154
+ selector.sc::tag = node.name
155
+ source.graph << presentation
156
+ source.sc::presentation = presentation
157
+ end
158
+
159
+ presentation.sc::x = node[:vx] if node[:vx]
160
+ presentation.sc::y = node[:vy] if node[:vy]
161
+ presentation.sc::width = node[:vw] if node[:vw]
162
+ presentation.sc::height = node[:vh] if node[:vh]
163
+ presentation.sc::font_size = node[:vsize] if node[:vsize]
164
+ presentation.sc::font_family = node[:vfont] if node[:vfont]
165
+ presentation.sc::font_weight = node[:vweight] if node[:vweight]
166
+ presentation.sc::text = node.text.strip
167
+
168
+ source
169
+ end
170
+
110
171
  end
111
172
  end
@@ -3,30 +3,33 @@ module Sc
3
3
  def filter doc
4
4
  contents = if sc::attribute.first
5
5
  # Select node's attribute if given
6
- sc::attribute.map { |attribute| doc[:content][attribute] }
6
+ sc::attribute.map { |attribute| [doc[:content][attribute], attribute] }
7
7
  else
8
- [ doc[:value] ]
8
+ [ [doc[:value], nil] ]
9
9
  end
10
10
 
11
11
  @indexes ||= Hash.new(0)
12
12
  prefix = sc::prefix.first.to_s
13
- prefix = (prefix =~ /\Ahttp/ ? URI::parse(doc[:uri]).merge(prefix).to_s : "#{doc[:uri]}#{prefix}")
13
+ prefix = (prefix =~ /\Ahttp\:/ or prefix =~ /\Ahttps\:/) ? URI::parse(doc[:uri]).merge(prefix).to_s : "#{doc[:uri]}#{prefix}"
14
14
  suffix = sc::suffix.first.to_s
15
15
 
16
- contents.map do |content|
17
- variable = if sc::sequence.first.to_s=="true"
18
- @indexes[prefix] += 1
16
+ contents.map do |content, attribute|
17
+ new_uri = if (content.to_s =~ /\Ahttp\:/ or content.to_s =~ /\Ahttps\:/)
18
+ "#{content}#{suffix}"
19
19
  else
20
- if sc::downcase.first.to_s=="true"
21
- content.to_s.underscore
20
+ variable = if sc::sequence.first.to_s=="true"
21
+ @indexes[prefix] += 1
22
22
  else
23
- content.to_s.wikify
23
+ if sc::downcase.first.to_s=="true"
24
+ content.to_s.underscore
25
+ else
26
+ content.to_s.wikify
27
+ end
24
28
  end
29
+ "#{prefix}#{variable}#{suffix}"
25
30
  end
26
31
 
27
- new_uri = "#{prefix}#{variable}#{suffix}"
28
-
29
- { :uri=>new_uri, :content=>doc[:content], :value=>new_uri }
32
+ { :uri=>new_uri, :content=>doc[:content], :value=>new_uri, :attribute=>attribute }
30
33
  end
31
34
  end
32
35
  end
@@ -3,7 +3,7 @@ module Sc
3
3
  def filter doc
4
4
  if sc::attribute.first
5
5
  # Select node's attribute if given
6
- sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute] } }
6
+ sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute], :attribute=>attribute } }
7
7
  else
8
8
  [ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:value], sc::format, doc[:uri]) } ]
9
9
  end
@@ -5,7 +5,7 @@ module Sc
5
5
  slices = doc[:value].split(separator)
6
6
  sc::index.map { |index| slices[index.to_i].to_s.strip }.
7
7
  select { |value| value != "" }.
8
- map { |value| { :uri=>doc[:uri], :content=>doc[:content], :value=>value} }
8
+ map { |value| { :uri=>doc[:uri], :content=>doc[:content], :value=>value, :attribute=>doc[:attribute]} }
9
9
  end.flatten
10
10
  end
11
11
  end
@@ -1,37 +1,58 @@
1
1
  module Sc
2
2
  class VisualSelector < Selector
3
3
  def filter doc
4
+ # By initializing variables, we avoid getting data from a hash (slow)
5
+ min_relative_x = (sc::min_relative_x.first.to_i if sc::min_relative_x.first)
6
+ max_relative_x = (sc::max_relative_x.first.to_i if sc::max_relative_x.first)
7
+ min_relative_y = (sc::min_relative_y.first.to_i if sc::min_relative_y.first)
8
+ max_relative_y = (sc::max_relative_y.first.to_i if sc::max_relative_y.first)
9
+ min_x = (sc::min_x.first.to_i if sc::min_x.first)
10
+ max_x = (sc::max_x.first.to_i if sc::max_x.first)
11
+ min_y = (sc::min_y.first.to_i if sc::min_y.first)
12
+ max_y = (sc::max_y.first.to_i if sc::max_y.first)
13
+ min_width = (sc::min_width.first.to_i if sc::min_width.first)
14
+ max_width = (sc::max_width.first.to_i if sc::max_width.first)
15
+ min_height = (sc::min_height.first.to_i if sc::min_height.first)
16
+ max_height = (sc::max_height.first.to_i if sc::max_height.first)
17
+ min_font_size = (sc::min_font_size.first.to_i if sc::min_font_size.first)
18
+ max_font_size = (sc::max_font_size.first.to_i if sc::max_font_size.first)
19
+ min_font_weight = (sc::min_font_weight.first.to_i if sc::min_font_weight.first)
20
+ max_font_weight = (sc::max_font_weight.first.to_i if sc::max_font_weight.first)
21
+ font_family = sc::font_family.first
22
+ attributes = sc::attribute
23
+ formats = sc::format
24
+
4
25
  doc[:content].search(sc::tag.first || "*").select do |node|
5
26
  relative_x = node['vx'].to_i - doc[:content]['vx'].to_i
6
27
  relative_y = node['vy'].to_i - doc[:content]['vy'].to_i
7
28
 
8
29
  !node.text? and
9
- ( !sc::min_relative_x.first or relative_x >= sc::min_relative_x.first.to_i) and
10
- ( !sc::max_relative_x.first or relative_x <= sc::max_relative_x.first.to_i) and
11
- ( !sc::min_relative_y.first or relative_y >= sc::min_relative_y.first.to_i) and
12
- ( !sc::max_relative_y.first or relative_y <= sc::max_relative_y.first.to_i) and
30
+ ( !min_relative_x or relative_x >= min_relative_x) and
31
+ ( !max_relative_x or relative_x <= max_relative_x) and
32
+ ( !min_relative_y or relative_y >= min_relative_y) and
33
+ ( !max_relative_y or relative_y <= max_relative_y) and
13
34
 
14
- ( !sc::min_x.first or node['vx'].to_i >= sc::min_x.first.to_i) and
15
- ( !sc::max_x.first or node['vx'].to_i <= sc::max_x.first.to_i) and
16
- ( !sc::min_y.first or node['vy'].to_i >= sc::min_y.first.to_i) and
17
- ( !sc::max_y.first or node['vy'].to_i <= sc::max_y.first.to_i) and
35
+ ( !min_x or node['vx'].to_i >= min_x) and
36
+ ( !max_x or node['vx'].to_i <= max_x) and
37
+ ( !min_y or node['vy'].to_i >= min_y) and
38
+ ( !max_y or node['vy'].to_i <= max_y) and
18
39
 
19
- ( !sc::min_width.first or node['vw'].to_i >= sc::min_width.first.to_i) and
20
- ( !sc::max_width.first or node['vw'].to_i <= sc::max_width.first.to_i) and
21
- ( !sc::min_height.first or node['vh'].to_i >= sc::min_height.first.to_i) and
22
- ( !sc::max_height.first or node['vh'].to_i <= sc::max_height.first.to_i) and
40
+ ( !min_width or node['vw'].to_i >= min_width) and
41
+ ( !max_width or node['vw'].to_i <= max_width) and
42
+ ( !min_height or node['vh'].to_i >= min_height) and
43
+ ( !max_height or node['vh'].to_i <= max_height) and
23
44
 
24
- ( !sc::min_font_size.first or node['vsize'].to_i >= sc::min_font_size.first.to_i) and
25
- ( !sc::max_font_size.first or node['vsize'].to_i <= sc::max_font_size.first.to_i) and
26
- ( !sc::min_font_weight.first or node['vweight'].to_i >= sc::min_font_weight.first.to_i) and
27
- ( !sc::max_font_weight.first or node['vweight'].to_i <= sc::max_font_weight.first.to_i) and
28
- ( !sc::font_family.first or node['vfont'] == sc::font_family.first)
45
+ ( !min_font_size or node['vsize'].to_i >= min_font_size) and
46
+ ( !max_font_size or node['vsize'].to_i <= max_font_size) and
47
+ ( !min_font_weight or node['vweight'].to_i >= min_font_weight) and
48
+ ( !max_font_weight or node['vweight'].to_i <= max_font_weight) and
49
+ ( !font_family or node['vfont'] == font_family)
29
50
  end.map do |content|
30
- if sc::attribute.first
51
+ if attributes.first
31
52
  # Select node's attribute if given
32
- sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>content, :value=>content[attribute] } }
53
+ attributes.map { |attribute| { :uri=>doc[:uri], :content=>content, :value=>content[attribute], :attribute=>attribute } }
33
54
  else
34
- [ { :uri=>doc[:uri], :content=>content, :value=>format(content, sc::format, doc[:uri]) } ]
55
+ [ { :uri=>doc[:uri], :content=>content, :value=>format(content, formats, doc[:uri]) } ]
35
56
  end
36
57
  end.flatten
37
58
  end
@@ -11,7 +11,7 @@ module Sc
11
11
  (doc[:content].search(pattern)[interval] || []).select { |node| patterns.any? ? patterns.include?(node.text.downcase.strip) : true }.map do |result|
12
12
  if sc::attribute.first
13
13
  # Select node's attribute if given
14
- sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
14
+ sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute], :attribute=>attribute } }
15
15
  else
16
16
  # Select node
17
17
  [ { :uri=>doc[:uri], :content=>result, :value=>format(result, sc::format, doc[:uri]) } ]
@@ -0,0 +1,121 @@
1
+ module Scrappy
2
+ module Optimizer
3
+ # Iterates through a knowledge base and tries to merge and generalize
4
+ # selectors whenever the output of the resulting kb is the same
5
+ def optimize kb, sample
6
+ # Get the output only once
7
+ output = RDF::Graph.new extract(sample[:uri], sample[:html], kb)
8
+
9
+ # Build an array of fragments
10
+ root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
11
+ fragments = []; root_fragments.each { |f| fragments << kb.node(Node(f.id, RDF::Graph.new(f.all_triples))) }
12
+
13
+ # Parse the document
14
+ doc = { :uri=>sample[:uri], :content=>Nokogiri::HTML(sample[:html], nil, 'utf-8') }
15
+
16
+ begin
17
+ changed, fragments = optimize_once fragments, doc, output
18
+ end until !changed
19
+
20
+ graph = RDF::Graph.new
21
+ fragments.each { |fragment| graph << fragment }
22
+
23
+ graph
24
+ end
25
+
26
+ protected
27
+ # Tries to optimize a set of fragments.
28
+ # Returns true if there were changes, false otherwise,
29
+ # and the new fragments as the second array element
30
+ def optimize_once fragments, doc, output
31
+ fragments.each do |fragment1|
32
+ fragments.each do |fragment2|
33
+ next if fragment1 == fragment2
34
+ new_fragment = mix_if_gain(fragment1, fragment2, doc, output)
35
+
36
+ # End if a new fragment was created
37
+ if new_fragment
38
+ return [true, fragments - [fragment1] - [fragment2] + [new_fragment]]
39
+ end
40
+ end
41
+ end
42
+ [false, fragments]
43
+ end
44
+
45
+ def mix_if_gain fragment1, fragment2, doc, output
46
+ # Won't get gain if the fragment does not produce the same kind of RDF resource
47
+ return if !fragment1.sc::type.equivalent?(fragment2.sc::type) or
48
+ !fragment1.sc::relation.equivalent?(fragment2.sc::relation) or
49
+ !fragment1.sc::superclass.equivalent?(fragment2.sc::superclass) or
50
+ !fragment1.sc::sameas.equivalent?(fragment2.sc::sameas) or
51
+ fragment1.sc::identifier.size != fragment2.sc::identifier.size
52
+
53
+ # Build new fragment
54
+ new_fragment = Node(nil)
55
+ new_fragment.rdf::type = fragment1.rdf::type
56
+ new_fragment.sc::type = fragment1.sc::type
57
+ new_fragment.sc::relation = fragment1.sc::relation
58
+ new_fragment.sc::superclass = fragment1.sc::superclass
59
+ new_fragment.sc::sameas = fragment1.sc::sameas
60
+
61
+ # sc:selector
62
+ selector = generalize_selectors(fragment1.sc::selector + fragment2.sc::selector)
63
+ new_fragment.graph << selector
64
+ new_fragment.sc::selector = selector
65
+
66
+ # sc:identifier
67
+ if fragment1.sc::identifier.first
68
+ selector = generalize_selectors(fragment1.sc::identifier + fragment2.sc::identifier)
69
+ new_fragment.graph << selector
70
+ new_fragment.sc::identifier = selector
71
+ end
72
+
73
+ # sc:subfragment
74
+ all_subfragments = fragment1.sc::subfragment + fragment2.sc::subfragment
75
+ all_subfragments.map { |sf| [sf.sc::type.sort_by(&:to_s), sf.sc::relation.sort_by(&:to_s)] }.uniq.each do |types, relations|
76
+ subfragments = all_subfragments.select do |sf|
77
+ sf.sc::type.sort_by(&:to_s) == types and
78
+ sf.sc::relation.sort_by(&:to_s) == relations
79
+ end
80
+ end
81
+
82
+ # Check new output
83
+ separate_output1 = fragment1.extract_graph :doc=>doc
84
+ separate_output2 = fragment2.extract_graph :doc=>doc
85
+ separate_output = separate_output1.merge separate_output2
86
+ new_output = new_fragment.proxy.extract_graph :doc=>doc
87
+
88
+ # Check if the output with the new fragment is a subset of the full output
89
+ # and if the output of the fragments alone is a subset of the output of the new
90
+ # fragment. This way we ensure the output is the same without using all the
91
+ # fragments that are available in the knowledge base.
92
+ new_fragment.proxy # if output.contains?(new_output) and new_output.contains?(separate_output)
93
+ end
94
+
95
+ def generalize_selectors selectors
96
+ selector = Node(nil)
97
+ selector.rdf::type = Node('sc:VisualSelector')
98
+ selector.sc::min_relative_x = selectors.map { |s| s.sc::min_relative_x.map(&:to_i) }.flatten.min.to_s
99
+ selector.sc::max_relative_x = selectors.map { |s| s.sc::max_relative_x.map(&:to_i) }.flatten.max.to_s
100
+ selector.sc::min_relative_y = selectors.map { |s| s.sc::min_relative_y.map(&:to_i) }.flatten.min.to_s
101
+ selector.sc::max_relative_y = selectors.map { |s| s.sc::max_relative_y.map(&:to_i) }.flatten.max.to_s
102
+ selector.sc::min_x = selectors.map { |s| s.sc::min_x.map(&:to_i) }.flatten.min.to_s
103
+ selector.sc::max_x = selectors.map { |s| s.sc::max_x.map(&:to_i) }.flatten.max.to_s
104
+ selector.sc::min_y = selectors.map { |s| s.sc::min_y.map(&:to_i) }.flatten.min.to_s
105
+ selector.sc::max_y = selectors.map { |s| s.sc::max_y.map(&:to_i) }.flatten.max.to_s
106
+ selector.sc::min_width = selectors.map { |s| s.sc::min_width.map(&:to_i) }.flatten.min.to_s
107
+ selector.sc::max_width = selectors.map { |s| s.sc::max_width.map(&:to_i) }.flatten.max.to_s
108
+ selector.sc::min_height = selectors.map { |s| s.sc::min_height.map(&:to_i) }.flatten.min.to_s
109
+ selector.sc::max_height = selectors.map { |s| s.sc::max_height.map(&:to_i) }.flatten.max.to_s
110
+ selector.sc::min_font_size = selectors.map { |s| s.sc::min_font_size.map(&:to_i) }.flatten.min.to_s if selectors.all? { |s| s.sc::min_font_size.first }
111
+ selector.sc::max_font_size = selectors.map { |s| s.sc::max_font_size.map(&:to_i) }.flatten.max.to_s if selectors.all? { |s| s.sc::max_font_size.first }
112
+ selector.sc::min_font_weight = selectors.map { |s| s.sc::min_font_weight.map(&:to_i) }.flatten.min.to_s if selectors.all? { |s| s.sc::min_font_weight.first }
113
+ selector.sc::max_font_weight = selectors.map { |s| s.sc::max_font_weight.map(&:to_i) }.flatten.max.to_s if selectors.all? { |s| s.sc::max_font_weight.first }
114
+ selector.sc::font_family = selectors.first.sc::font_family if selectors.map { |s| s.sc::font_family }.flatten.uniq.size == 1
115
+ selector.sc::tag = selectors.first.sc::tag if selectors.map { |s| s.sc::tag }.flatten.uniq.size == 1
116
+ selector.sc::attribute = selectors.first.sc::attribute if selectors.map { |s| s.sc::attribute }.flatten.uniq.size == 1
117
+
118
+ selector
119
+ end
120
+ end
121
+ end
@@ -6,10 +6,6 @@ module Scrappy
6
6
  triples + train_sample(sample).triples
7
7
  end )
8
8
  end
9
-
10
- # Optimizes the knowledge base by generalizing patterns
11
- def optimize
12
- end
13
9
 
14
10
  private
15
11
  def train_sample sample
@@ -34,37 +30,31 @@ module Scrappy
34
30
  fragment.graph << selector
35
31
  fragment.sc::selector = selector
36
32
  when ID("sc:uri") then
37
- # Assumption: URIs are extracted from a link
38
33
  selector = selector_for(node.sc::uri.first.sc::source.first, node)
39
- selector.sc::tag = "a"
40
- selector.sc::attribute = "href"
41
-
42
34
  fragment.graph << selector
43
35
  fragment.sc::identifier = selector
44
36
  when ID("rdf:type") then
45
37
  fragment.sc::type = node.rdf::type
46
38
  else
47
- if node[predicate].map(&:class).uniq.first != String
48
- subfragments = node[predicate].map { |subnode| fragment_for(subnode, node) }
49
- # Mix the subfragments
50
- id = subfragments.first
51
- graph = RDF::Graph.new( subfragments.inject([]) do |triples, subfragment|
52
- triples + subfragment.graph.triples.map { |s,p,o| [s==subfragment.id ? id : s,p,o] }
53
- end )
54
- subfragment = graph[id]
55
- subfragment.sc::relation = Node(predicate)
56
- subfragment.sc::min_cardinality = "1"
39
+ if node[predicate].map(&:class).uniq.first == RDF::Node
40
+ node[predicate].map do |subnode|
41
+ subfragment = fragment_for(subnode, node)
42
+ subfragment.sc::relation = Node(predicate)
57
43
 
58
- fragment.graph << subfragment
59
- fragment.sc::subfragment += [subfragment]
44
+ fragment.graph << subfragment
45
+ fragment.sc::subfragment += [subfragment]
46
+ end
60
47
  end
61
48
  end
62
49
  end
63
- fragment.rdf::type = Node("sc:Fragment") if parent.nil?
50
+ fragment.rdf::type = Node("sc:Fragment")
51
+ fragment.sc::min_cardinality = "1"
52
+ fragment.sc::max_cardinality = "1"
64
53
  fragment
65
54
  end
66
55
 
67
56
  def selector_for fragment, parent=nil
57
+ fragment_selector = fragment.sc::selector.first
68
58
  presentation = fragment.sc::presentation.first
69
59
 
70
60
  selector = Node(nil)
@@ -94,6 +84,9 @@ module Scrappy
94
84
  selector.sc::min_font_weight = presentation.sc::font_weight
95
85
  selector.sc::max_font_weight = presentation.sc::font_weight
96
86
  selector.sc::font_family = presentation.sc::font_family
87
+
88
+ selector.sc::tag = fragment_selector.sc::tag.select { |tag| ["a","img"].include?(tag) }
89
+ selector.sc::attribute = fragment_selector.sc::attribute
97
90
 
98
91
  selector
99
92
  end
@@ -62,7 +62,7 @@ module Scrappy
62
62
  map { |node| node.sc::type }.flatten.map(&:to_s).sort
63
63
  haml :patterns
64
64
  end
65
-
65
+
66
66
  app.delete '/patterns/*' do |uri|
67
67
  Scrappy::App.delete_pattern uri
68
68
  flash[:notice] = "Pattern deleted"
@@ -94,6 +94,12 @@ module Scrappy
94
94
  redirect "#{settings.base_uri}/samples"
95
95
  end
96
96
 
97
+ app.post '/samples/:id/optimize' do |id|
98
+ Scrappy::App.save_patterns agent.optimize(Scrappy::Kb.patterns, Scrappy::App.samples[id.to_i])
99
+ flash[:notice] = "Optimization completed"
100
+ redirect "#{settings.base_uri}/samples"
101
+ end
102
+
97
103
  app.post '/samples' do
98
104
  html = Iconv.iconv('UTF-8', params[:encoding], params[:html]).first
99
105
  sample = Scrappy::App.add_sample(:html=>html, :uri=>params[:uri], :date=>Time.now)
@@ -29,4 +29,28 @@ class String
29
29
  tr("-", "_").
30
30
  downcase
31
31
  end
32
+ end
33
+
34
+ class Array
35
+ # Return true if a given array has the same elements as this one
36
+ def equivalent? array
37
+ self.all? { |i| array.include?(i) } and
38
+ array.all? { |i| self.include?(i) }
39
+ end
40
+ end
41
+
42
+ module RDF
43
+ class Node
44
+ def self.mix *nodes
45
+ id = nodes.first
46
+ graph = RDF::Graph.new( nodes.inject([]) do |triples, node|
47
+ triples + node.graph.triples.map do |s,p,o|
48
+ [ s==node.id ? id : s,
49
+ p==node.id ? id : p,
50
+ o==node.id ? id : o ]
51
+ end
52
+ end )
53
+ graph[id]
54
+ end
55
+ end
32
56
  end
data/lib/scrappy.rb CHANGED
@@ -15,7 +15,8 @@ require 'scrappy/support'
15
15
  require 'scrappy/repository'
16
16
 
17
17
  require 'scrappy/extractor/extractor'
18
- require 'scrappy/trainer/trainer'
18
+ require 'scrappy/learning/trainer'
19
+ require 'scrappy/learning/optimizer'
19
20
  require 'scrappy/agent/map_reduce'
20
21
  require 'scrappy/agent/cache'
21
22
  require 'scrappy/agent/dumper'
@@ -23,5 +24,5 @@ require 'scrappy/agent/blind_agent'
23
24
  require 'scrappy/agent/agent'
24
25
 
25
26
  module Scrappy
26
- VERSION = '0.3.2'
27
+ VERSION = '0.3.3'
27
28
  end
@@ -16,16 +16,28 @@ var add_visual_data = function() {
16
16
  item.setAttribute('vy', y);
17
17
  item.setAttribute('vw', item.offsetWidth);
18
18
  item.setAttribute('vh', item.offsetHeight);
19
- var size = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-size');
20
- size = size.substring(0, size.length-2);
21
- item.setAttribute('vsize', size);
22
- var fonts = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-family').split(",");
23
- var font = fonts[fonts.length-1].trim();
24
- item.setAttribute('vfont', font);
25
- var weight = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-weight');
26
- if (weight == 'normal') weight = 400;
27
- if (weight == 'bold') weight = 700;
28
- item.setAttribute('vweight', weight);
19
+
20
+ var item_with_text = false;
21
+ for (var k=0; k<item.childNodes.length; k++) {
22
+ child = item.childNodes[k]
23
+ if (child.nodeName == "#text" && child.textContent.trim() != "") {
24
+ item_with_text = true;
25
+ break;
26
+ }
27
+ }
28
+
29
+ if (item_with_text) {
30
+ var size = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-size');
31
+ size = size.substring(0, size.length-2);
32
+ item.setAttribute('vsize', size);
33
+ var fonts = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-family').split(",");
34
+ var font = fonts[fonts.length-1].trim();
35
+ item.setAttribute('vfont', font);
36
+ var weight = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-weight');
37
+ if (weight == 'normal') weight = 400;
38
+ if (weight == 'bold') weight = 700;
39
+ item.setAttribute('vweight', weight);
40
+ }
29
41
  }
30
42
  }
31
43
 
@@ -166,7 +166,7 @@ ul.detail li span.name, ul.detail li span.short_name {
166
166
  font-family: monospace;
167
167
  }
168
168
  ul.detail li span.short_name {
169
- width: 420px;
169
+ width: 350px;
170
170
  }
171
171
  ul.detail li span.format {
172
172
  float: right;
data/scrappy.gemspec CHANGED
@@ -2,30 +2,30 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scrappy}
5
- s.version = "0.3.2"
5
+ s.version = "0.3.3"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
9
- s.date = %q{2011-03-18}
9
+ s.date = %q{2011-03-25}
10
10
  s.default_executable = %q{scrappy}
11
11
  s.description = %q{RDF web scraper}
12
12
  s.email = %q{joseignacio.fernandez@gmail.com}
13
13
  s.executables = ["scrappy"]
14
- s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/extractor/extractor.rb", "lib/scrappy/extractor/formats.rb", "lib/scrappy/extractor/fragment.rb", "lib/scrappy/extractor/selector.rb", "lib/scrappy/extractor/selectors/base_uri.rb", "lib/scrappy/extractor/selectors/css.rb", "lib/scrappy/extractor/selectors/new_uri.rb", "lib/scrappy/extractor/selectors/root.rb", "lib/scrappy/extractor/selectors/section.rb", "lib/scrappy/extractor/selectors/slice.rb", "lib/scrappy/extractor/selectors/uri.rb", "lib/scrappy/extractor/selectors/uri_pattern.rb", "lib/scrappy/extractor/selectors/visual.rb", "lib/scrappy/extractor/selectors/xpath.rb", "lib/scrappy/repository.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb", "lib/scrappy/trainer/trainer.rb"]
15
- s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/extractor/extractor.rb", "lib/scrappy/extractor/formats.rb", "lib/scrappy/extractor/fragment.rb", "lib/scrappy/extractor/selector.rb", "lib/scrappy/extractor/selectors/base_uri.rb", "lib/scrappy/extractor/selectors/css.rb", "lib/scrappy/extractor/selectors/new_uri.rb", "lib/scrappy/extractor/selectors/root.rb", "lib/scrappy/extractor/selectors/section.rb", "lib/scrappy/extractor/selectors/slice.rb", "lib/scrappy/extractor/selectors/uri.rb", "lib/scrappy/extractor/selectors/uri_pattern.rb", "lib/scrappy/extractor/selectors/visual.rb", "lib/scrappy/extractor/selectors/xpath.rb", "lib/scrappy/repository.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb", "lib/scrappy/trainer/trainer.rb", "public/favicon.ico", "public/images/logo.png", "public/images/logo_tiny.png", "public/javascripts/annotator.js", "public/javascripts/remote.js", "public/stylesheets/application.css", "test/test_helper.rb", "test/test_scrappy.rb", "views/extractors.haml", "views/help.haml", "views/home.haml", "views/layout.haml", "views/patterns.haml", "views/samples.haml", "scrappy.gemspec"]
14
+ s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "extractors/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/extractor/extractor.rb", "lib/scrappy/extractor/formats.rb", "lib/scrappy/extractor/fragment.rb", "lib/scrappy/extractor/selector.rb", "lib/scrappy/extractor/selectors/base_uri.rb", "lib/scrappy/extractor/selectors/css.rb", "lib/scrappy/extractor/selectors/new_uri.rb", "lib/scrappy/extractor/selectors/root.rb", "lib/scrappy/extractor/selectors/section.rb", "lib/scrappy/extractor/selectors/slice.rb", "lib/scrappy/extractor/selectors/uri.rb", "lib/scrappy/extractor/selectors/uri_pattern.rb", "lib/scrappy/extractor/selectors/visual.rb", "lib/scrappy/extractor/selectors/xpath.rb", "lib/scrappy/learning/optimizer.rb", "lib/scrappy/learning/trainer.rb", "lib/scrappy/repository.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb"]
15
+ s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "extractors/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/extractor/extractor.rb", "lib/scrappy/extractor/formats.rb", "lib/scrappy/extractor/fragment.rb", "lib/scrappy/extractor/selector.rb", "lib/scrappy/extractor/selectors/base_uri.rb", "lib/scrappy/extractor/selectors/css.rb", "lib/scrappy/extractor/selectors/new_uri.rb", "lib/scrappy/extractor/selectors/root.rb", "lib/scrappy/extractor/selectors/section.rb", "lib/scrappy/extractor/selectors/slice.rb", "lib/scrappy/extractor/selectors/uri.rb", "lib/scrappy/extractor/selectors/uri_pattern.rb", "lib/scrappy/extractor/selectors/visual.rb", "lib/scrappy/extractor/selectors/xpath.rb", "lib/scrappy/learning/optimizer.rb", "lib/scrappy/learning/trainer.rb", "lib/scrappy/repository.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb", "public/favicon.ico", "public/images/logo.png", "public/images/logo_tiny.png", "public/javascripts/annotator.js", "public/javascripts/remote.js", "public/stylesheets/application.css", "test/test_helper.rb", "test/test_scrappy.rb", "views/extractors.haml", "views/help.haml", "views/home.haml", "views/layout.haml", "views/patterns.haml", "views/samples.haml", "scrappy.gemspec"]
16
16
  s.homepage = %q{http://github.com/josei/scrappy}
17
17
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
18
18
  s.require_paths = ["lib"]
19
19
  s.rubyforge_project = %q{scrappy}
20
- s.rubygems_version = %q{1.3.7}
20
+ s.rubygems_version = %q{1.3.6}
21
21
  s.summary = %q{Web scraper that allows producing RDF data out of plain web pages}
22
- s.test_files = ["test/test_helper.rb", "test/test_scrappy.rb"]
22
+ s.test_files = ["test/test_scrappy.rb", "test/test_helper.rb"]
23
23
 
24
24
  if s.respond_to? :specification_version then
25
25
  current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
26
26
  s.specification_version = 3
27
27
 
28
- if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
28
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
29
29
  s.add_runtime_dependency(%q<activesupport>, [">= 2.3.5"])
30
30
  s.add_runtime_dependency(%q<sinatra>, [">= 1.1.2"])
31
31
  s.add_runtime_dependency(%q<thin>, [">= 1.2.7"])
data/views/samples.haml CHANGED
@@ -21,6 +21,8 @@
21
21
  -[['Patterns output', :patterns], ['Extractors output', :extractors]].reverse.each do |text, action|
22
22
  %span.format
23
23
  %a{:href=>"#{settings.base_uri}/samples/#{i}/#{action}"}=text
24
+ %span.format
25
+ %a{:href=>"#{settings.base_uri}/samples/#{i}/optimize", :'data-method'=>:post} Optimize
24
26
  %span.format
25
27
  %a{:href=>"#{settings.base_uri}/samples/#{i}/train", :'data-method'=>:post} Train
26
28
  %span.date
metadata CHANGED
@@ -1,13 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrappy
3
3
  version: !ruby/object:Gem::Version
4
- hash: 23
5
4
  prerelease: false
6
5
  segments:
7
6
  - 0
8
7
  - 3
9
- - 2
10
- version: 0.3.2
8
+ - 3
9
+ version: 0.3.3
11
10
  platform: ruby
12
11
  authors:
13
12
  - Jose Ignacio
@@ -15,18 +14,16 @@ autorequire:
15
14
  bindir: bin
16
15
  cert_chain: []
17
16
 
18
- date: 2011-03-18 00:00:00 +01:00
17
+ date: 2011-03-25 00:00:00 +01:00
19
18
  default_executable:
20
19
  dependencies:
21
20
  - !ruby/object:Gem::Dependency
22
21
  name: activesupport
23
22
  prerelease: false
24
23
  requirement: &id001 !ruby/object:Gem::Requirement
25
- none: false
26
24
  requirements:
27
25
  - - ">="
28
26
  - !ruby/object:Gem::Version
29
- hash: 9
30
27
  segments:
31
28
  - 2
32
29
  - 3
@@ -38,11 +35,9 @@ dependencies:
38
35
  name: sinatra
39
36
  prerelease: false
40
37
  requirement: &id002 !ruby/object:Gem::Requirement
41
- none: false
42
38
  requirements:
43
39
  - - ">="
44
40
  - !ruby/object:Gem::Version
45
- hash: 23
46
41
  segments:
47
42
  - 1
48
43
  - 1
@@ -54,11 +49,9 @@ dependencies:
54
49
  name: thin
55
50
  prerelease: false
56
51
  requirement: &id003 !ruby/object:Gem::Requirement
57
- none: false
58
52
  requirements:
59
53
  - - ">="
60
54
  - !ruby/object:Gem::Version
61
- hash: 17
62
55
  segments:
63
56
  - 1
64
57
  - 2
@@ -70,11 +63,9 @@ dependencies:
70
63
  name: nokogiri
71
64
  prerelease: false
72
65
  requirement: &id004 !ruby/object:Gem::Requirement
73
- none: false
74
66
  requirements:
75
67
  - - ">="
76
68
  - !ruby/object:Gem::Version
77
- hash: 5
78
69
  segments:
79
70
  - 1
80
71
  - 4
@@ -86,11 +77,9 @@ dependencies:
86
77
  name: mechanize
87
78
  prerelease: false
88
79
  requirement: &id005 !ruby/object:Gem::Requirement
89
- none: false
90
80
  requirements:
91
81
  - - ">="
92
82
  - !ruby/object:Gem::Version
93
- hash: 23
94
83
  segments:
95
84
  - 1
96
85
  - 0
@@ -102,11 +91,9 @@ dependencies:
102
91
  name: lightrdf
103
92
  prerelease: false
104
93
  requirement: &id006 !ruby/object:Gem::Requirement
105
- none: false
106
94
  requirements:
107
95
  - - ">="
108
96
  - !ruby/object:Gem::Version
109
- hash: 19
110
97
  segments:
111
98
  - 0
112
99
  - 3
@@ -118,11 +105,9 @@ dependencies:
118
105
  name: i18n
119
106
  prerelease: false
120
107
  requirement: &id007 !ruby/object:Gem::Requirement
121
- none: false
122
108
  requirements:
123
109
  - - ">="
124
110
  - !ruby/object:Gem::Version
125
- hash: 11
126
111
  segments:
127
112
  - 0
128
113
  - 4
@@ -134,11 +119,9 @@ dependencies:
134
119
  name: rest-client
135
120
  prerelease: false
136
121
  requirement: &id008 !ruby/object:Gem::Requirement
137
- none: false
138
122
  requirements:
139
123
  - - ">="
140
124
  - !ruby/object:Gem::Version
141
- hash: 13
142
125
  segments:
143
126
  - 1
144
127
  - 6
@@ -150,11 +133,9 @@ dependencies:
150
133
  name: haml
151
134
  prerelease: false
152
135
  requirement: &id009 !ruby/object:Gem::Requirement
153
- none: false
154
136
  requirements:
155
137
  - - ">="
156
138
  - !ruby/object:Gem::Version
157
- hash: 55
158
139
  segments:
159
140
  - 3
160
141
  - 0
@@ -166,11 +147,9 @@ dependencies:
166
147
  name: rack-flash
167
148
  prerelease: false
168
149
  requirement: &id010 !ruby/object:Gem::Requirement
169
- none: false
170
150
  requirements:
171
151
  - - ">="
172
152
  - !ruby/object:Gem::Version
173
- hash: 25
174
153
  segments:
175
154
  - 0
176
155
  - 1
@@ -187,6 +166,7 @@ extensions: []
187
166
  extra_rdoc_files:
188
167
  - README.rdoc
189
168
  - bin/scrappy
169
+ - extractors/elmundo.yarf
190
170
  - lib/scrappy.rb
191
171
  - lib/scrappy/agent/agent.rb
192
172
  - lib/scrappy/agent/blind_agent.rb
@@ -207,20 +187,21 @@ extra_rdoc_files:
207
187
  - lib/scrappy/extractor/selectors/uri_pattern.rb
208
188
  - lib/scrappy/extractor/selectors/visual.rb
209
189
  - lib/scrappy/extractor/selectors/xpath.rb
190
+ - lib/scrappy/learning/optimizer.rb
191
+ - lib/scrappy/learning/trainer.rb
210
192
  - lib/scrappy/repository.rb
211
193
  - lib/scrappy/server/admin.rb
212
194
  - lib/scrappy/server/errors.rb
213
195
  - lib/scrappy/server/helpers.rb
214
196
  - lib/scrappy/server/server.rb
215
197
  - lib/scrappy/support.rb
216
- - lib/scrappy/trainer/trainer.rb
217
198
  files:
218
199
  - History.txt
219
200
  - Manifest
220
201
  - README.rdoc
221
202
  - Rakefile
222
203
  - bin/scrappy
223
- - kb/elmundo.yarf
204
+ - extractors/elmundo.yarf
224
205
  - lib/scrappy.rb
225
206
  - lib/scrappy/agent/agent.rb
226
207
  - lib/scrappy/agent/blind_agent.rb
@@ -241,13 +222,14 @@ files:
241
222
  - lib/scrappy/extractor/selectors/uri_pattern.rb
242
223
  - lib/scrappy/extractor/selectors/visual.rb
243
224
  - lib/scrappy/extractor/selectors/xpath.rb
225
+ - lib/scrappy/learning/optimizer.rb
226
+ - lib/scrappy/learning/trainer.rb
244
227
  - lib/scrappy/repository.rb
245
228
  - lib/scrappy/server/admin.rb
246
229
  - lib/scrappy/server/errors.rb
247
230
  - lib/scrappy/server/helpers.rb
248
231
  - lib/scrappy/server/server.rb
249
232
  - lib/scrappy/support.rb
250
- - lib/scrappy/trainer/trainer.rb
251
233
  - public/favicon.ico
252
234
  - public/images/logo.png
253
235
  - public/images/logo_tiny.png
@@ -278,20 +260,16 @@ rdoc_options:
278
260
  require_paths:
279
261
  - lib
280
262
  required_ruby_version: !ruby/object:Gem::Requirement
281
- none: false
282
263
  requirements:
283
264
  - - ">="
284
265
  - !ruby/object:Gem::Version
285
- hash: 3
286
266
  segments:
287
267
  - 0
288
268
  version: "0"
289
269
  required_rubygems_version: !ruby/object:Gem::Requirement
290
- none: false
291
270
  requirements:
292
271
  - - ">="
293
272
  - !ruby/object:Gem::Version
294
- hash: 11
295
273
  segments:
296
274
  - 1
297
275
  - 2
@@ -299,10 +277,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
299
277
  requirements: []
300
278
 
301
279
  rubyforge_project: scrappy
302
- rubygems_version: 1.3.7
280
+ rubygems_version: 1.3.6
303
281
  signing_key:
304
282
  specification_version: 3
305
283
  summary: Web scraper that allows producing RDF data out of plain web pages
306
284
  test_files:
307
- - test/test_helper.rb
308
285
  - test/test_scrappy.rb
286
+ - test/test_helper.rb