RubyGems - scrappy - Versions diffs - 0.3.2 → 0.3.3 - Mend

scrappy 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

data/History.txt +7 -0
data/Manifest +3 -2
data/bin/scrappy +6 -5
data/{kb → extractors}/elmundo.yarf +0 -0
data/lib/scrappy/agent/agent.rb +1 -0
data/lib/scrappy/extractor/extractor.rb +14 -64
data/lib/scrappy/extractor/fragment.rb +131 -70
data/lib/scrappy/extractor/selectors/new_uri.rb +15 -12
data/lib/scrappy/extractor/selectors/root.rb +1 -1
data/lib/scrappy/extractor/selectors/slice.rb +1 -1
data/lib/scrappy/extractor/selectors/visual.rb +41 -20
data/lib/scrappy/extractor/selectors/xpath.rb +1 -1
data/lib/scrappy/learning/optimizer.rb +121 -0
data/lib/scrappy/{trainer → learning}/trainer.rb +14 -21
data/lib/scrappy/server/admin.rb +7 -1
data/lib/scrappy/support.rb +24 -0
data/lib/scrappy.rb +3 -2
data/public/javascripts/annotator.js +22 -10
data/public/stylesheets/application.css +1 -1
data/scrappy.gemspec +7 -7
data/views/samples.haml +2 -0
metadata +11 -33

data/History.txt CHANGED Viewed

@@ -1,3 +1,10 @@
+=== 0.3.3 2011-03-25
+* Fix in NewUriSelector
+* Improved extraction process
+* Removed -R option
+* Removed irrelevant references to base URIs
 === 0.3.2 2011-03-18
 * Correction of issue with certain Ruby versions

data/Manifest CHANGED Viewed

@@ -3,7 +3,7 @@ Manifest
 README.rdoc
 Rakefile
 bin/scrappy
-kb/elmundo.yarf
+extractors/elmundo.yarf
 lib/scrappy.rb
 lib/scrappy/agent/agent.rb
 lib/scrappy/agent/blind_agent.rb
@@ -24,13 +24,14 @@ lib/scrappy/extractor/selectors/uri.rb
 lib/scrappy/extractor/selectors/uri_pattern.rb
 lib/scrappy/extractor/selectors/visual.rb
 lib/scrappy/extractor/selectors/xpath.rb
+lib/scrappy/learning/optimizer.rb
+lib/scrappy/learning/trainer.rb
 lib/scrappy/repository.rb
 lib/scrappy/server/admin.rb
 lib/scrappy/server/errors.rb
 lib/scrappy/server/helpers.rb
 lib/scrappy/server/server.rb
 lib/scrappy/support.rb
-lib/scrappy/trainer/trainer.rb
 public/favicon.ico
 public/images/logo.png
 public/images/logo_tiny.png

data/bin/scrappy CHANGED Viewed

@@ -40,8 +40,7 @@ module Scrappy
         opts.on('-c C', '--concurrence C')      { |c| Agent::Options.workers = c.to_i }
         opts.on('-d D', '--delay D')            { |d| Agent::Options.delay = d; Agent::Options.workers = 1 }
         opts.on('-l L', '--levels L')           { |l| Agent::Options.depth = l.to_i }
-        opts.on('-r', '--reference')            { Agent::Options.referenceable = :minimum }
-        opts.on('-R', '--reference-all')        { Agent::Options.referenceable = :dump }
+        opts.on('-r', '--reference')            { Agent::Options.referenceable = true }
         opts.on('-f FORMAT', '--format FORMAT') { |f| Agent::Options.format = f.to_sym }
         opts.on('-t TIME', '--time TIME')       { |t| Agent::Options.time = t.to_i*60 } # converts minutes to seconds
         opts.on('-o URIs', '--observe URIs')    { |uris| Options.observe = uris.split(',') }
@@ -106,9 +105,12 @@ module Scrappy
     end
     def self.add_pattern graph
       new_patterns = Scrappy::Kb.patterns.merge graph
-      open(@patterns_file, "w") { |f| f.write new_patterns.serialize(:yarf) }
+      save_patterns new_patterns
       onload
     end
+    def self.save_patterns new_patterns
+      open(@patterns_file, "w") { |f| f.write new_patterns.serialize(:yarf) }
+    end
     def self.delete_pattern uri
       graph = Scrappy::Kb.patterns
       fragments = graph.find(nil, Node('rdf:type'), Node('sc:Fragment')).
@@ -169,8 +171,7 @@ Options
   -a, --admin [ROOT]       Runs admin web server (optionally specify server's root url)
   -P, --port PORT          Selects port number (default is 3434)
   -t, --time TIME          Returns repository data from the last given minutes
-  -r, --reference          Outputs referenceable data
-  -R, --reference-all      Outputs all HTML referenceable data
+  -r, --reference          Outputs reference information
 Authors
   José Ignacio Fernández, Alberto Mardomingo, Jacobo Blasco

data/{kb → extractors}/elmundo.yarf RENAMED Viewed

File without changes

data/lib/scrappy/agent/agent.rb CHANGED Viewed

@@ -3,6 +3,7 @@ module Scrappy
     include MonitorMixin
     include Extractor
     include Trainer
+    include Optimizer
     include MapReduce
     include Cached
     include BlindAgent

data/lib/scrappy/extractor/extractor.rb CHANGED Viewed

@@ -28,9 +28,6 @@ module Scrappy
           end
         end
-        # Add references to sources if requested
-        triples += add_referenceable_data uri, content, triples, referenceable if referenceable
         puts "done!" if self.options.debug
         triples
@@ -38,71 +35,24 @@ module Scrappy
     end
     def fragments_for kb, uri
-      uri_selectors = ( kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')) +
-                        kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector')) ).
-                        flatten.select do |uri_selector|
-        !kb.node(uri_selector).filter(:uri=>uri).empty?
+      root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
+      selectors = []
+      fragments = {}
+      root_fragments.each do |fragment|
+        fragment.sc::selector.each do |selector|
+          fragments[selector] = fragment
+          selectors << selector
+        end
       end
-      visual_selectors = kb.find(nil, Node('rdf:type'), Node('sc:VisualSelector'))
-      selectors = uri_selectors + visual_selectors
-      selectors.map { |selector| kb.find(nil, Node('sc:selector'), selector) }.
-                flatten.
-                select { |selector| selector.rdf::type.include?(Node('sc:Fragment')) }
-    end
-    private
-    def add_referenceable_data uri, content, given_triples, referenceable
-      triples = []
-      resources = {}; given_triples.each { |s,p,o| resources[s] = resources[o] = true }
-      fragment = Node(Extractor.node_hash(uri, '/'))
-      selector = Node(nil)
-      presentation = Node(nil)
-      selector.rdf::type = Node('sc:UnivocalSelector')
-      selector.sc::path = '/'
-      selector.sc::document = uri
+      uri_selectors = selectors.select { |selector| selector.rdf::type.include?(Node('sc:UriSelector')) or
+                                                    selector.rdf::type.include?(Node('sc:UriPatternSelector')) }.
+                                select { |selector| !kb.node(selector).filter(:uri=>uri).empty? }
-      fragment.sc::selector = selector
-      triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples) if referenceable==:dump or resources[fragment.id]
-      content.search('*').each do |node|
-        next if node.text?
-        fragment = Extractor.node_hash(uri, node.path)
-        if referenceable == :dump or resources[fragment]
-          selector     = ID(nil)
-          presentation = ID(nil)
-          triples << [selector, ID('rdf:type'), ID('sc:UnivocalSelector')]
-          triples << [selector, ID('sc:path'), node.path.to_s]
-          triples << [selector, ID('sc:tag'), node.name.to_s]
-          triples << [selector, ID('sc:document'), uri]
-          triples << [presentation, ID('sc:x'), node[:vx].to_s] if node[:vx]
-          triples << [presentation, ID('sc:y'), node[:vy].to_s] if node[:vy]
-          triples << [presentation, ID('sc:width'), node[:vw].to_s] if node[:vw]
-          triples << [presentation, ID('sc:height'), node[:vh].to_s] if node[:vh]
-          triples << [presentation, ID('sc:font_size'), node[:vsize].gsub("px","").to_s] if node[:vsize]
-          triples << [presentation, ID('sc:font_family'), node[:vfont]] if node[:vfont]
-          triples << [presentation, ID('sc:font_weight'), node[:vweight].to_s] if node[:vweight]
-          triples << [presentation, ID('sc:text'), node.text.strip]
-          triples << [fragment, ID('sc:selector'), selector]
-          triples << [fragment, ID('sc:presentation'), presentation]
-        end
-      end
-      triples
-    end
+      visual_selectors = selectors.select { |selector| selector.rdf::type.include?(Node('sc:VisualSelector')) }
-    def self.node_hash uri, path
-      digest = Digest::MD5.hexdigest("#{uri} #{path}")
-      :"_:bnode#{digest}"
+      (uri_selectors + visual_selectors).map { |selector| fragments[selector] }
     end
   end
 end

data/lib/scrappy/extractor/fragment.rb CHANGED Viewed

@@ -2,101 +2,126 @@ module Sc
   class Fragment
     include RDF::NodeProxy
+    # Extracts data out of a document and returns an RDF::Graph
+    def extract_graph options={}
+      graph = RDF::Graph.new
+      extract(options).each { |node| graph << node }
+      graph
+    end
+    # Extracts data out of a document and returns an array of nodes
     def extract options={}
-      uri    = options[:doc][:uri]
+      # Extracts all the mappings and any subfragment
+      mappings(options).map do |result|
+        node         = result[:node]
+        subfragments = result[:subfragments]
+        doc          = result[:doc]
+        # Process subfragments
+        consistent = true
+        subfragments.each do |subfragment|
+          # Get subfragment object
+          subfragment = subfragment.proxy Node('sc:Fragment')
+          # Extract data from the subfragment
+          subnodes  = subfragment.extract(options.merge(:doc=>doc))
+          # Add relations
+          subnodes.each do |subnode|
+            node.graph << subnode if subnode.is_a?(RDF::Node)
+            subfragment.sc::relation.each { |relation| node[relation] += [subnode] }
+          end
+          # Check consistency
+          consistent = false if subfragment.sc::min_cardinality.first and subnodes.size < subfragment.sc::min_cardinality.first.to_i
+          consistent = false if subfragment.sc::max_cardinality.first and subnodes.size > subfragment.sc::max_cardinality.first.to_i
+        end
+        # Skip the node if it has inconsistent relations
+        # For example: extracting a sioc:Post with no dc:title would
+        # violate the constraint sc:min_cardinality = 1
+        next if !consistent
+        node
+      end.compact
+    end
+    # Returns all the mappings between this fragment and RDF nodes
+    def mappings options
       # Identify the fragment's mappings
       docs = sc::selector.map { |s| graph.node(s).select options[:doc] }.flatten
-      # Generate nodes for each page mapping
+      # Generate a result for each page mapping
       docs.map do |doc|
         # Build RDF nodes from identifier selectors (if present)
-        nodes = self.nodes(uri, doc, options[:referenceable])
+        node = build_node(doc, options[:referenceable])
-        # Add info to each node
-        nodes.map do |node|
-          # Build the object -- it can be a node or a literal
-          object = if sc::type.include?(Node('rdf:Literal'))
-            value = doc[:value].to_s.strip
-            if options[:referenceable]
-              node.rdf::value = value
-              node.rdf::type  = Node('rdf:Literal')
-              node
-            else
-              value
-            end
-          else
-            # Add statements about the node
-            sc::type.each       { |type|       node.rdf::type += [type] if type != Node('rdf:Resource') }
-            sc::superclass.each { |superclass| node.rdfs::subClassOf += [superclass] }
-            sc::sameas.each     { |samenode|   node.owl::sameAs += [samenode] }
+        # Skip the node if no URI or bnode is created
+        next if !node
+        # Add info to the node
+        # Build the object -- it can be a node or a literal
+        object = if sc::type.include?(Node('rdf:Literal'))
+          value = doc[:value].to_s.strip
+          if options[:referenceable]
+            node.rdf::value = value
+            node.rdf::type  = Node('rdf:Literal')
             node
+          else
+            value
           end
+        else
+          # Add statements about the node
+          sc::type.each       { |type|       node.rdf::type += [type] if type != Node('rdf:Resource') }
+          sc::superclass.each { |superclass| node.rdfs::subClassOf += [superclass] }
+          sc::sameas.each     { |samenode|   node.owl::sameAs += [samenode] }
-          # Process subfragments
-          consistent = true
-          sc::subfragment.each do |subfragment|
-            # Get subfragment object
-            subfragment = graph.node(subfragment, Node('sc:Fragment'))
-            # Extract data from the subfragment
-            subnodes    = subfragment.extract(options.merge(:doc=>doc))
-            # Add relations
-            subnodes.each do |subnode|
-              node.graph << subnode if subnode.is_a?(RDF::Node)
-              subfragment.sc::relation.each { |relation| node[relation] += [subnode] }
-            end
-            # Check consistency
-            consistent = false if subfragment.sc::min_cardinality.first and subnodes.size < subfragment.sc::min_cardinality.first.to_i
-            consistent = false if subfragment.sc::max_cardinality.first and subnodes.size > subfragment.sc::max_cardinality.first.to_i
-          end
+          node
+        end
-          # Skip the node if it has inconsistent relations
-          # For example: extracting a sioc:Post with no dc:title would
-          # violate the constraint sc:min_cardinality = 1
-          next if !consistent
-          # Add referenceable data if requested
-          if options[:referenceable]
-            sources = [doc[:content]].flatten.map { |n| Node(Scrappy::Extractor.node_hash(doc[:uri], n.path)) }
-            sources.each do |source|
-              sc::type.each     { |type|     source.sc::type     += [type] }
-              sc::relation.each { |relation| source.sc::relation += [relation] }
-              node.graph << source
-              node.sc::source += [source]
-            end
-          end
-          # Object points to either the node or the literal
-          object
+        # Add referenceable data if requested
+        if options[:referenceable] and node.size > 0
+          source                = reference(doc)
+          source.sc::type       = sc::type
+          source.sc::superclass = sc::superclass
+          source.sc::sameas     = sc::sameas
+          source.sc::relation   = sc::relation
+          node.graph           << source
+          node.sc::source       = source
         end
-      end.flatten.compact
+        # Variable object points to either a node or a literal
+        # Return the object, as well as its subfragments (if any)
+        # and the doc it was extracted from
+        { :node=>object, :subfragments=>sc::subfragment, :doc=>doc }
+      end.compact
     end
-    def nodes uri, doc, referenceable
-      nodes = sc::identifier.map { |s| graph.node(s).select doc }.flatten.map do |d|
-        node = Node(parse_uri(uri, d[:value]))
+    private
+    # Builds a node given a document
+    def build_node doc, referenceable
+      return Node(nil) if sc::identifier.empty?
+      sc::identifier.map { |s| graph.node(s).select doc }.flatten.map do |d|
+        node = Node(parse_uri(d[:uri], d[:value]))
         if referenceable
           # Include the fragment where the URI was built from
-          uri_node = Node(nil, node.graph)
-          hash     = Scrappy::Extractor.node_hash(d[:uri], d[:content].path)
-          node.sc::uri        = uri_node
+          uri_node            = Node(nil)
+          source              = reference(d)
+          uri_node.graph     << source
           uri_node.rdf::value = node.to_s
-          uri_node.sc::source = Node(hash)
+          uri_node.sc::source = source
+          node.graph  << uri_node
+          node.sc::uri = uri_node
         end
         node
-      end
-      nodes << Node(nil) if nodes.empty?
-      nodes
+      end.first
     end
-    private
     # Parses a URI by resolving relative paths
     def parse_uri(uri, rel_uri)
       return ID('*') if rel_uri.nil?
@@ -107,5 +132,41 @@ module Sc
       end
     end
+    # Builds an RDF reference to an HTML node
+    def reference doc
+      node      = doc[:content].is_a?(Nokogiri::XML::NodeSet) ? doc[:content].first.parent : doc[:content]
+      attribute = doc[:attribute]
+      uri       = doc[:uri]
+      source       = Node(nil)
+      selector     = Node(nil)
+      presentation = Node(nil)
+      source.graph       << selector
+      source.sc::selector = selector
+      selector.rdf::type           = Node('sc:UnivocalSelector')
+      selector.sc::path            = node.path
+      selector.sc::document        = uri
+      selector.sc::attribute       = attribute if attribute
+      if node.path != '/'
+        selector.sc::tag        = node.name
+        source.graph           << presentation
+        source.sc::presentation = presentation
+      end
+      presentation.sc::x           = node[:vx] if node[:vx]
+      presentation.sc::y           = node[:vy] if node[:vy]
+      presentation.sc::width       = node[:vw] if node[:vw]
+      presentation.sc::height      = node[:vh] if node[:vh]
+      presentation.sc::font_size   = node[:vsize] if node[:vsize]
+      presentation.sc::font_family = node[:vfont] if node[:vfont]
+      presentation.sc::font_weight = node[:vweight] if node[:vweight]
+      presentation.sc::text        = node.text.strip
+      source
+    end
   end
 end

data/lib/scrappy/extractor/selectors/new_uri.rb CHANGED Viewed

@@ -3,30 +3,33 @@ module Sc
     def filter doc
       contents = if sc::attribute.first
         # Select node's attribute if given
-        sc::attribute.map { |attribute| doc[:content][attribute] }
+        sc::attribute.map { |attribute| [doc[:content][attribute], attribute] }
       else
-        [ doc[:value] ]
+        [ [doc[:value], nil] ]
       end
       @indexes ||= Hash.new(0)
       prefix = sc::prefix.first.to_s
-      prefix = (prefix =~ /\Ahttp/ ? URI::parse(doc[:uri]).merge(prefix).to_s : "#{doc[:uri]}#{prefix}")
+      prefix = (prefix =~ /\Ahttp\:/ or prefix =~ /\Ahttps\:/) ? URI::parse(doc[:uri]).merge(prefix).to_s : "#{doc[:uri]}#{prefix}"
       suffix = sc::suffix.first.to_s
-      contents.map do |content|
-        variable = if sc::sequence.first.to_s=="true"
-          @indexes[prefix] += 1
+      contents.map do |content, attribute|
+        new_uri = if (content.to_s =~ /\Ahttp\:/ or content.to_s =~ /\Ahttps\:/)
+          "#{content}#{suffix}"
         else
-          if sc::downcase.first.to_s=="true"
-            content.to_s.underscore
+          variable = if sc::sequence.first.to_s=="true"
+            @indexes[prefix] += 1
           else
-            content.to_s.wikify
+            if sc::downcase.first.to_s=="true"
+              content.to_s.underscore
+            else
+              content.to_s.wikify
+            end
           end
+          "#{prefix}#{variable}#{suffix}"
         end
-        new_uri = "#{prefix}#{variable}#{suffix}"
-        { :uri=>new_uri, :content=>doc[:content], :value=>new_uri }
+        { :uri=>new_uri, :content=>doc[:content], :value=>new_uri, :attribute=>attribute }
       end
     end
   end

data/lib/scrappy/extractor/selectors/root.rb CHANGED Viewed

@@ -3,7 +3,7 @@ module Sc
     def filter doc
       if sc::attribute.first
         # Select node's attribute if given
-        sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute] } }
+        sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute], :attribute=>attribute } }
       else
         [ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:value], sc::format, doc[:uri]) } ]
       end

data/lib/scrappy/extractor/selectors/slice.rb CHANGED Viewed

@@ -5,7 +5,7 @@ module Sc
         slices = doc[:value].split(separator)
         sc::index.map { |index| slices[index.to_i].to_s.strip }.
                   select { |value| value != "" }.
-                  map { |value| { :uri=>doc[:uri], :content=>doc[:content], :value=>value} }
+                  map { |value| { :uri=>doc[:uri], :content=>doc[:content], :value=>value, :attribute=>doc[:attribute]} }
       end.flatten
     end
   end

data/lib/scrappy/extractor/selectors/visual.rb CHANGED Viewed

@@ -1,37 +1,58 @@
 module Sc
   class VisualSelector < Selector
     def filter doc
+      # By initializing variables, we avoid getting data from a hash (slow)
+      min_relative_x  = (sc::min_relative_x.first.to_i if sc::min_relative_x.first)
+      max_relative_x  = (sc::max_relative_x.first.to_i if sc::max_relative_x.first)
+      min_relative_y  = (sc::min_relative_y.first.to_i if sc::min_relative_y.first)
+      max_relative_y  = (sc::max_relative_y.first.to_i if sc::max_relative_y.first)
+      min_x           = (sc::min_x.first.to_i if sc::min_x.first)
+      max_x           = (sc::max_x.first.to_i if sc::max_x.first)
+      min_y           = (sc::min_y.first.to_i if sc::min_y.first)
+      max_y           = (sc::max_y.first.to_i if sc::max_y.first)
+      min_width       = (sc::min_width.first.to_i if sc::min_width.first)
+      max_width       = (sc::max_width.first.to_i if sc::max_width.first)
+      min_height      = (sc::min_height.first.to_i if sc::min_height.first)
+      max_height      = (sc::max_height.first.to_i if sc::max_height.first)
+      min_font_size   = (sc::min_font_size.first.to_i if sc::min_font_size.first)
+      max_font_size   = (sc::max_font_size.first.to_i if sc::max_font_size.first)
+      min_font_weight = (sc::min_font_weight.first.to_i if sc::min_font_weight.first)
+      max_font_weight = (sc::max_font_weight.first.to_i if sc::max_font_weight.first)
+      font_family     =  sc::font_family.first
+      attributes      =  sc::attribute
+      formats         =  sc::format
       doc[:content].search(sc::tag.first || "*").select do |node|
         relative_x = node['vx'].to_i - doc[:content]['vx'].to_i
         relative_y = node['vy'].to_i - doc[:content]['vy'].to_i
         !node.text? and
-        ( !sc::min_relative_x.first  or relative_x          >= sc::min_relative_x.first.to_i) and
-        ( !sc::max_relative_x.first  or relative_x          <= sc::max_relative_x.first.to_i) and
-        ( !sc::min_relative_y.first  or relative_y          >= sc::min_relative_y.first.to_i) and
-        ( !sc::max_relative_y.first  or relative_y          <= sc::max_relative_y.first.to_i) and
+        ( !min_relative_x  or relative_x          >= min_relative_x) and
+        ( !max_relative_x  or relative_x          <= max_relative_x) and
+        ( !min_relative_y  or relative_y          >= min_relative_y) and
+        ( !max_relative_y  or relative_y          <= max_relative_y) and
-        ( !sc::min_x.first           or node['vx'].to_i      >= sc::min_x.first.to_i) and
-        ( !sc::max_x.first           or node['vx'].to_i      <= sc::max_x.first.to_i) and
-        ( !sc::min_y.first           or node['vy'].to_i      >= sc::min_y.first.to_i) and
-        ( !sc::max_y.first           or node['vy'].to_i      <= sc::max_y.first.to_i) and
+        ( !min_x           or node['vx'].to_i      >= min_x) and
+        ( !max_x           or node['vx'].to_i      <= max_x) and
+        ( !min_y           or node['vy'].to_i      >= min_y) and
+        ( !max_y           or node['vy'].to_i      <= max_y) and
-        ( !sc::min_width.first       or node['vw'].to_i      >= sc::min_width.first.to_i) and
-        ( !sc::max_width.first       or node['vw'].to_i      <= sc::max_width.first.to_i) and
-        ( !sc::min_height.first      or node['vh'].to_i      >= sc::min_height.first.to_i) and
-        ( !sc::max_height.first      or node['vh'].to_i      <= sc::max_height.first.to_i) and
+        ( !min_width       or node['vw'].to_i      >= min_width) and
+        ( !max_width       or node['vw'].to_i      <= max_width) and
+        ( !min_height      or node['vh'].to_i      >= min_height) and
+        ( !max_height      or node['vh'].to_i      <= max_height) and
-        ( !sc::min_font_size.first   or node['vsize'].to_i   >= sc::min_font_size.first.to_i) and
-        ( !sc::max_font_size.first   or node['vsize'].to_i   <= sc::max_font_size.first.to_i) and
-        ( !sc::min_font_weight.first or node['vweight'].to_i >= sc::min_font_weight.first.to_i) and
-        ( !sc::max_font_weight.first or node['vweight'].to_i <= sc::max_font_weight.first.to_i) and
-        ( !sc::font_family.first     or node['vfont']        == sc::font_family.first)
+        ( !min_font_size   or node['vsize'].to_i   >= min_font_size) and
+        ( !max_font_size   or node['vsize'].to_i   <= max_font_size) and
+        ( !min_font_weight or node['vweight'].to_i >= min_font_weight) and
+        ( !max_font_weight or node['vweight'].to_i <= max_font_weight) and
+        ( !font_family     or node['vfont']        == font_family)
       end.map do |content|
-        if sc::attribute.first
+        if attributes.first
           # Select node's attribute if given
-          sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>content, :value=>content[attribute] } }
+          attributes.map { |attribute| { :uri=>doc[:uri], :content=>content, :value=>content[attribute], :attribute=>attribute } }
         else
-          [ { :uri=>doc[:uri], :content=>content, :value=>format(content, sc::format, doc[:uri]) } ]
+          [ { :uri=>doc[:uri], :content=>content, :value=>format(content, formats, doc[:uri]) } ]
         end
       end.flatten
     end

data/lib/scrappy/extractor/selectors/xpath.rb CHANGED Viewed

@@ -11,7 +11,7 @@ module Sc
         (doc[:content].search(pattern)[interval] || []).select { |node| patterns.any? ? patterns.include?(node.text.downcase.strip) : true }.map do |result|
           if sc::attribute.first
             # Select node's attribute if given
-            sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
+            sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute], :attribute=>attribute } }
           else
             # Select node
             [ { :uri=>doc[:uri], :content=>result, :value=>format(result, sc::format, doc[:uri]) } ]

data/lib/scrappy/learning/optimizer.rb ADDED Viewed

@@ -0,0 +1,121 @@
+module Scrappy
+  module Optimizer
+    # Iterates through a knowledge base and tries to merge and generalize
+    # selectors whenever the output of the resulting kb is the same
+    def optimize kb, sample
+      # Get the output only once
+      output = RDF::Graph.new extract(sample[:uri], sample[:html], kb)
+      # Build an array of fragments
+      root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
+      fragments = []; root_fragments.each { |f| fragments << kb.node(Node(f.id, RDF::Graph.new(f.all_triples))) }
+      # Parse the document
+      doc = { :uri=>sample[:uri], :content=>Nokogiri::HTML(sample[:html], nil, 'utf-8') }
+      begin
+        changed, fragments = optimize_once fragments, doc, output
+      end until !changed
+      graph = RDF::Graph.new
+      fragments.each { |fragment| graph << fragment }
+      graph
+    end
+    protected
+    # Tries to optimize a set of fragments.
+    # Returns true if there were changes, false otherwise,
+    # and the new fragments as the second array element
+    def optimize_once fragments, doc, output
+      fragments.each do |fragment1|
+        fragments.each do |fragment2|
+          next if fragment1 == fragment2
+          new_fragment = mix_if_gain(fragment1, fragment2, doc, output)
+          # End if a new fragment was created
+          if new_fragment
+            return [true, fragments - [fragment1] - [fragment2] + [new_fragment]]
+          end
+        end
+      end
+      [false, fragments]
+    end
+    def mix_if_gain fragment1, fragment2, doc, output
+      # Won't get gain if the fragment does not produce the same kind of RDF resource
+      return if !fragment1.sc::type.equivalent?(fragment2.sc::type) or
+                !fragment1.sc::relation.equivalent?(fragment2.sc::relation) or
+                !fragment1.sc::superclass.equivalent?(fragment2.sc::superclass) or
+                !fragment1.sc::sameas.equivalent?(fragment2.sc::sameas) or
+                 fragment1.sc::identifier.size != fragment2.sc::identifier.size
+      # Build new fragment
+      new_fragment = Node(nil)
+      new_fragment.rdf::type      = fragment1.rdf::type
+      new_fragment.sc::type       = fragment1.sc::type
+      new_fragment.sc::relation   = fragment1.sc::relation
+      new_fragment.sc::superclass = fragment1.sc::superclass
+      new_fragment.sc::sameas     = fragment1.sc::sameas
+      # sc:selector
+      selector = generalize_selectors(fragment1.sc::selector + fragment2.sc::selector)
+      new_fragment.graph       << selector
+      new_fragment.sc::selector = selector
+      # sc:identifier
+      if fragment1.sc::identifier.first
+        selector = generalize_selectors(fragment1.sc::identifier + fragment2.sc::identifier)
+        new_fragment.graph         << selector
+        new_fragment.sc::identifier = selector
+      end
+      # sc:subfragment
+      all_subfragments = fragment1.sc::subfragment + fragment2.sc::subfragment
+      all_subfragments.map { |sf| [sf.sc::type.sort_by(&:to_s), sf.sc::relation.sort_by(&:to_s)] }.uniq.each do |types, relations|
+        subfragments = all_subfragments.select do |sf|
+          sf.sc::type.sort_by(&:to_s)     == types and
+          sf.sc::relation.sort_by(&:to_s) == relations
+        end
+      end
+      # Check new output
+      separate_output1 = fragment1.extract_graph :doc=>doc
+      separate_output2 = fragment2.extract_graph :doc=>doc
+      separate_output  = separate_output1.merge separate_output2
+      new_output       = new_fragment.proxy.extract_graph :doc=>doc
+      # Check if the output with the new fragment is a subset of the full output
+      # and if the output of the fragments alone is a subset of the output of the new
+      # fragment. This way we ensure the output is the same without using all the
+      # fragments that are available in the knowledge base.
+      new_fragment.proxy # if output.contains?(new_output) and new_output.contains?(separate_output)
+    end
+    def generalize_selectors selectors
+      selector = Node(nil)
+      selector.rdf::type           = Node('sc:VisualSelector')
+      selector.sc::min_relative_x  = selectors.map { |s| s.sc::min_relative_x.map(&:to_i) }.flatten.min.to_s
+      selector.sc::max_relative_x  = selectors.map { |s| s.sc::max_relative_x.map(&:to_i) }.flatten.max.to_s
+      selector.sc::min_relative_y  = selectors.map { |s| s.sc::min_relative_y.map(&:to_i) }.flatten.min.to_s
+      selector.sc::max_relative_y  = selectors.map { |s| s.sc::max_relative_y.map(&:to_i) }.flatten.max.to_s
+      selector.sc::min_x           = selectors.map { |s| s.sc::min_x.map(&:to_i) }.flatten.min.to_s
+      selector.sc::max_x           = selectors.map { |s| s.sc::max_x.map(&:to_i) }.flatten.max.to_s
+      selector.sc::min_y           = selectors.map { |s| s.sc::min_y.map(&:to_i) }.flatten.min.to_s
+      selector.sc::max_y           = selectors.map { |s| s.sc::max_y.map(&:to_i) }.flatten.max.to_s
+      selector.sc::min_width       = selectors.map { |s| s.sc::min_width.map(&:to_i) }.flatten.min.to_s
+      selector.sc::max_width       = selectors.map { |s| s.sc::max_width.map(&:to_i) }.flatten.max.to_s
+      selector.sc::min_height      = selectors.map { |s| s.sc::min_height.map(&:to_i) }.flatten.min.to_s
+      selector.sc::max_height      = selectors.map { |s| s.sc::max_height.map(&:to_i) }.flatten.max.to_s
+      selector.sc::min_font_size   = selectors.map { |s| s.sc::min_font_size.map(&:to_i) }.flatten.min.to_s if selectors.all? { |s| s.sc::min_font_size.first }
+      selector.sc::max_font_size   = selectors.map { |s| s.sc::max_font_size.map(&:to_i) }.flatten.max.to_s if selectors.all? { |s| s.sc::max_font_size.first }
+      selector.sc::min_font_weight = selectors.map { |s| s.sc::min_font_weight.map(&:to_i) }.flatten.min.to_s if selectors.all? { |s| s.sc::min_font_weight.first }
+      selector.sc::max_font_weight = selectors.map { |s| s.sc::max_font_weight.map(&:to_i) }.flatten.max.to_s if selectors.all? { |s| s.sc::max_font_weight.first }
+      selector.sc::font_family     = selectors.first.sc::font_family if selectors.map { |s| s.sc::font_family }.flatten.uniq.size == 1
+      selector.sc::tag             = selectors.first.sc::tag if selectors.map { |s| s.sc::tag }.flatten.uniq.size == 1
+      selector.sc::attribute       = selectors.first.sc::attribute if selectors.map { |s| s.sc::attribute }.flatten.uniq.size == 1
+      selector
+    end
+  end
+end

data/lib/scrappy/{trainer → learning}/trainer.rb RENAMED Viewed

@@ -6,10 +6,6 @@ module Scrappy
         triples + train_sample(sample).triples
       end )
     end
-    # Optimizes the knowledge base by generalizing patterns
-    def optimize
-    end
     private
     def train_sample sample
@@ -34,37 +30,31 @@ module Scrappy
           fragment.graph << selector
           fragment.sc::selector = selector
         when ID("sc:uri") then
-          # Assumption: URIs are extracted from a link
           selector = selector_for(node.sc::uri.first.sc::source.first, node)
-          selector.sc::tag = "a"
-          selector.sc::attribute = "href"
           fragment.graph << selector
           fragment.sc::identifier = selector
         when ID("rdf:type") then
           fragment.sc::type = node.rdf::type
         else
-          if node[predicate].map(&:class).uniq.first != String
-            subfragments = node[predicate].map { |subnode| fragment_for(subnode, node) }
-            # Mix the subfragments
-            id = subfragments.first
-            graph = RDF::Graph.new( subfragments.inject([]) do |triples, subfragment|
-              triples + subfragment.graph.triples.map { |s,p,o| [s==subfragment.id ? id : s,p,o] }
-            end )
-            subfragment = graph[id]
-            subfragment.sc::relation = Node(predicate)
-            subfragment.sc::min_cardinality = "1"
+          if node[predicate].map(&:class).uniq.first == RDF::Node
+            node[predicate].map do |subnode|
+              subfragment = fragment_for(subnode, node)
+              subfragment.sc::relation = Node(predicate)
-            fragment.graph << subfragment
-            fragment.sc::subfragment += [subfragment]
+              fragment.graph << subfragment
+              fragment.sc::subfragment += [subfragment]
+            end
           end
         end
       end
-      fragment.rdf::type = Node("sc:Fragment") if parent.nil?
+      fragment.rdf::type = Node("sc:Fragment")
+      fragment.sc::min_cardinality = "1"
+      fragment.sc::max_cardinality = "1"
       fragment
     end
     def selector_for fragment, parent=nil
+      fragment_selector = fragment.sc::selector.first
       presentation = fragment.sc::presentation.first
       selector = Node(nil)
@@ -94,6 +84,9 @@ module Scrappy
       selector.sc::min_font_weight = presentation.sc::font_weight
       selector.sc::max_font_weight = presentation.sc::font_weight
       selector.sc::font_family     = presentation.sc::font_family
+      selector.sc::tag       = fragment_selector.sc::tag.select { |tag| ["a","img"].include?(tag) }
+      selector.sc::attribute = fragment_selector.sc::attribute
       selector
     end

data/lib/scrappy/server/admin.rb CHANGED Viewed

@@ -62,7 +62,7 @@ module Scrappy
                 map { |node| node.sc::type }.flatten.map(&:to_s).sort
         haml :patterns
       end
       app.delete '/patterns/*' do |uri|
         Scrappy::App.delete_pattern uri
         flash[:notice] = "Pattern deleted"
@@ -94,6 +94,12 @@ module Scrappy
         redirect "#{settings.base_uri}/samples"
       end
+      app.post '/samples/:id/optimize' do |id|
+        Scrappy::App.save_patterns agent.optimize(Scrappy::Kb.patterns, Scrappy::App.samples[id.to_i])
+        flash[:notice] = "Optimization completed"
+        redirect "#{settings.base_uri}/samples"
+      end
       app.post '/samples' do
         html   = Iconv.iconv('UTF-8', params[:encoding], params[:html]).first
         sample = Scrappy::App.add_sample(:html=>html, :uri=>params[:uri], :date=>Time.now)

data/lib/scrappy/support.rb CHANGED Viewed

@@ -29,4 +29,28 @@ class String
     tr("-", "_").
     downcase
   end
+end
+class Array
+  # Return true if a given array has the same elements as this one
+  def equivalent? array
+    self.all?  { |i| array.include?(i) } and
+    array.all? { |i| self.include?(i)  }
+  end
+end
+module RDF
+  class Node
+    def self.mix *nodes
+      id = nodes.first
+      graph = RDF::Graph.new( nodes.inject([]) do |triples, node|
+        triples + node.graph.triples.map do |s,p,o|
+          [ s==node.id ? id : s,
+            p==node.id ? id : p,
+            o==node.id ? id : o ]
+        end
+      end )
+      graph[id]
+    end
+  end
 end

data/lib/scrappy.rb CHANGED Viewed

@@ -15,7 +15,8 @@ require 'scrappy/support'
 require 'scrappy/repository'
 require 'scrappy/extractor/extractor'
-require 'scrappy/trainer/trainer'
+require 'scrappy/learning/trainer'
+require 'scrappy/learning/optimizer'
 require 'scrappy/agent/map_reduce'
 require 'scrappy/agent/cache'
 require 'scrappy/agent/dumper'
@@ -23,5 +24,5 @@ require 'scrappy/agent/blind_agent'
 require 'scrappy/agent/agent'
 module Scrappy
-  VERSION = '0.3.2'
+  VERSION = '0.3.3'
 end

data/public/javascripts/annotator.js CHANGED Viewed

@@ -16,16 +16,28 @@ var add_visual_data = function() {
     item.setAttribute('vy', y);
     item.setAttribute('vw', item.offsetWidth);
     item.setAttribute('vh', item.offsetHeight);
-    var size = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-size');
-    size = size.substring(0, size.length-2);
-    item.setAttribute('vsize', size);
-    var fonts = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-family').split(",");
-    var font = fonts[fonts.length-1].trim();
-    item.setAttribute('vfont', font);
-    var weight = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-weight');
-    if (weight == 'normal') weight = 400;
-    if (weight == 'bold')   weight = 700;
-    item.setAttribute('vweight', weight);
+    var item_with_text = false;
+    for (var k=0; k<item.childNodes.length; k++) {
+      child = item.childNodes[k]
+      if (child.nodeName == "#text" && child.textContent.trim() != "") {
+        item_with_text = true;
+        break;
+      }
+    }
+    if (item_with_text) {
+      var size = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-size');
+      size = size.substring(0, size.length-2);
+      item.setAttribute('vsize', size);
+      var fonts = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-family').split(",");
+      var font = fonts[fonts.length-1].trim();
+      item.setAttribute('vfont', font);
+      var weight = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-weight');
+      if (weight == 'normal') weight = 400;
+      if (weight == 'bold')   weight = 700;
+      item.setAttribute('vweight', weight);
+    }
   }
 }

data/public/stylesheets/application.css CHANGED Viewed

@@ -166,7 +166,7 @@ ul.detail li span.name, ul.detail li span.short_name {
   font-family: monospace;
 }
 ul.detail li span.short_name {
-  width: 420px;
+  width: 350px;
 }
 ul.detail li span.format {
   float: right;

data/scrappy.gemspec CHANGED Viewed

@@ -2,30 +2,30 @@
 Gem::Specification.new do |s|
   s.name = %q{scrappy}
-  s.version = "0.3.2"
+  s.version = "0.3.3"
   s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
   s.authors = ["Jose Ignacio"]
-  s.date = %q{2011-03-18}
+  s.date = %q{2011-03-25}
   s.default_executable = %q{scrappy}
   s.description = %q{RDF web scraper}
   s.email = %q{joseignacio.fernandez@gmail.com}
   s.executables = ["scrappy"]
-  s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/extractor/extractor.rb", "lib/scrappy/extractor/formats.rb", "lib/scrappy/extractor/fragment.rb", "lib/scrappy/extractor/selector.rb", "lib/scrappy/extractor/selectors/base_uri.rb", "lib/scrappy/extractor/selectors/css.rb", "lib/scrappy/extractor/selectors/new_uri.rb", "lib/scrappy/extractor/selectors/root.rb", "lib/scrappy/extractor/selectors/section.rb", "lib/scrappy/extractor/selectors/slice.rb", "lib/scrappy/extractor/selectors/uri.rb", "lib/scrappy/extractor/selectors/uri_pattern.rb", "lib/scrappy/extractor/selectors/visual.rb", "lib/scrappy/extractor/selectors/xpath.rb", "lib/scrappy/repository.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb", "lib/scrappy/trainer/trainer.rb"]
-  s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/extractor/extractor.rb", "lib/scrappy/extractor/formats.rb", "lib/scrappy/extractor/fragment.rb", "lib/scrappy/extractor/selector.rb", "lib/scrappy/extractor/selectors/base_uri.rb", "lib/scrappy/extractor/selectors/css.rb", "lib/scrappy/extractor/selectors/new_uri.rb", "lib/scrappy/extractor/selectors/root.rb", "lib/scrappy/extractor/selectors/section.rb", "lib/scrappy/extractor/selectors/slice.rb", "lib/scrappy/extractor/selectors/uri.rb", "lib/scrappy/extractor/selectors/uri_pattern.rb", "lib/scrappy/extractor/selectors/visual.rb", "lib/scrappy/extractor/selectors/xpath.rb", "lib/scrappy/repository.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb", "lib/scrappy/trainer/trainer.rb", "public/favicon.ico", "public/images/logo.png", "public/images/logo_tiny.png", "public/javascripts/annotator.js", "public/javascripts/remote.js", "public/stylesheets/application.css", "test/test_helper.rb", "test/test_scrappy.rb", "views/extractors.haml", "views/help.haml", "views/home.haml", "views/layout.haml", "views/patterns.haml", "views/samples.haml", "scrappy.gemspec"]
+  s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "extractors/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/extractor/extractor.rb", "lib/scrappy/extractor/formats.rb", "lib/scrappy/extractor/fragment.rb", "lib/scrappy/extractor/selector.rb", "lib/scrappy/extractor/selectors/base_uri.rb", "lib/scrappy/extractor/selectors/css.rb", "lib/scrappy/extractor/selectors/new_uri.rb", "lib/scrappy/extractor/selectors/root.rb", "lib/scrappy/extractor/selectors/section.rb", "lib/scrappy/extractor/selectors/slice.rb", "lib/scrappy/extractor/selectors/uri.rb", "lib/scrappy/extractor/selectors/uri_pattern.rb", "lib/scrappy/extractor/selectors/visual.rb", "lib/scrappy/extractor/selectors/xpath.rb", "lib/scrappy/learning/optimizer.rb", "lib/scrappy/learning/trainer.rb", "lib/scrappy/repository.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb"]
+  s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "extractors/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/extractor/extractor.rb", "lib/scrappy/extractor/formats.rb", "lib/scrappy/extractor/fragment.rb", "lib/scrappy/extractor/selector.rb", "lib/scrappy/extractor/selectors/base_uri.rb", "lib/scrappy/extractor/selectors/css.rb", "lib/scrappy/extractor/selectors/new_uri.rb", "lib/scrappy/extractor/selectors/root.rb", "lib/scrappy/extractor/selectors/section.rb", "lib/scrappy/extractor/selectors/slice.rb", "lib/scrappy/extractor/selectors/uri.rb", "lib/scrappy/extractor/selectors/uri_pattern.rb", "lib/scrappy/extractor/selectors/visual.rb", "lib/scrappy/extractor/selectors/xpath.rb", "lib/scrappy/learning/optimizer.rb", "lib/scrappy/learning/trainer.rb", "lib/scrappy/repository.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb", "public/favicon.ico", "public/images/logo.png", "public/images/logo_tiny.png", "public/javascripts/annotator.js", "public/javascripts/remote.js", "public/stylesheets/application.css", "test/test_helper.rb", "test/test_scrappy.rb", "views/extractors.haml", "views/help.haml", "views/home.haml", "views/layout.haml", "views/patterns.haml", "views/samples.haml", "scrappy.gemspec"]
   s.homepage = %q{http://github.com/josei/scrappy}
   s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
   s.require_paths = ["lib"]
   s.rubyforge_project = %q{scrappy}
-  s.rubygems_version = %q{1.3.7}
+  s.rubygems_version = %q{1.3.6}
   s.summary = %q{Web scraper that allows producing RDF data out of plain web pages}
-  s.test_files = ["test/test_helper.rb", "test/test_scrappy.rb"]
+  s.test_files = ["test/test_scrappy.rb", "test/test_helper.rb"]
   if s.respond_to? :specification_version then
     current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
     s.specification_version = 3
-    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
+    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
       s.add_runtime_dependency(%q<activesupport>, [">= 2.3.5"])
       s.add_runtime_dependency(%q<sinatra>, [">= 1.1.2"])
       s.add_runtime_dependency(%q<thin>, [">= 1.2.7"])

data/views/samples.haml CHANGED Viewed

@@ -21,6 +21,8 @@
             -[['Patterns output', :patterns], ['Extractors output', :extractors]].reverse.each do |text, action|
               %span.format
                 %a{:href=>"#{settings.base_uri}/samples/#{i}/#{action}"}=text
+            %span.format
+              %a{:href=>"#{settings.base_uri}/samples/#{i}/optimize", :'data-method'=>:post} Optimize
             %span.format
               %a{:href=>"#{settings.base_uri}/samples/#{i}/train", :'data-method'=>:post} Train
             %span.date

metadata CHANGED Viewed

@@ -1,13 +1,12 @@
 --- !ruby/object:Gem::Specification
 name: scrappy
 version: !ruby/object:Gem::Version
-  hash: 23
   prerelease: false
   segments:
   - 0
   - 3
-  - 2
-  version: 0.3.2
+  - 3
+  version: 0.3.3
 platform: ruby
 authors:
 - Jose Ignacio
@@ -15,18 +14,16 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-03-18 00:00:00 +01:00
+date: 2011-03-25 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activesupport
   prerelease: false
   requirement: &id001 !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        hash: 9
         segments:
         - 2
         - 3
@@ -38,11 +35,9 @@ dependencies:
   name: sinatra
   prerelease: false
   requirement: &id002 !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        hash: 23
         segments:
         - 1
         - 1
@@ -54,11 +49,9 @@ dependencies:
   name: thin
   prerelease: false
   requirement: &id003 !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        hash: 17
         segments:
         - 1
         - 2
@@ -70,11 +63,9 @@ dependencies:
   name: nokogiri
   prerelease: false
   requirement: &id004 !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        hash: 5
         segments:
         - 1
         - 4
@@ -86,11 +77,9 @@ dependencies:
   name: mechanize
   prerelease: false
   requirement: &id005 !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        hash: 23
         segments:
         - 1
         - 0
@@ -102,11 +91,9 @@ dependencies:
   name: lightrdf
   prerelease: false
   requirement: &id006 !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        hash: 19
         segments:
         - 0
         - 3
@@ -118,11 +105,9 @@ dependencies:
   name: i18n
   prerelease: false
   requirement: &id007 !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        hash: 11
         segments:
         - 0
         - 4
@@ -134,11 +119,9 @@ dependencies:
   name: rest-client
   prerelease: false
   requirement: &id008 !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        hash: 13
         segments:
         - 1
         - 6
@@ -150,11 +133,9 @@ dependencies:
   name: haml
   prerelease: false
   requirement: &id009 !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        hash: 55
         segments:
         - 3
         - 0
@@ -166,11 +147,9 @@ dependencies:
   name: rack-flash
   prerelease: false
   requirement: &id010 !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        hash: 25
         segments:
         - 0
         - 1
@@ -187,6 +166,7 @@ extensions: []
 extra_rdoc_files:
 - README.rdoc
 - bin/scrappy
+- extractors/elmundo.yarf
 - lib/scrappy.rb
 - lib/scrappy/agent/agent.rb
 - lib/scrappy/agent/blind_agent.rb
@@ -207,20 +187,21 @@ extra_rdoc_files:
 - lib/scrappy/extractor/selectors/uri_pattern.rb
 - lib/scrappy/extractor/selectors/visual.rb
 - lib/scrappy/extractor/selectors/xpath.rb
+- lib/scrappy/learning/optimizer.rb
+- lib/scrappy/learning/trainer.rb
 - lib/scrappy/repository.rb
 - lib/scrappy/server/admin.rb
 - lib/scrappy/server/errors.rb
 - lib/scrappy/server/helpers.rb
 - lib/scrappy/server/server.rb
 - lib/scrappy/support.rb
-- lib/scrappy/trainer/trainer.rb
 files:
 - History.txt
 - Manifest
 - README.rdoc
 - Rakefile
 - bin/scrappy
-- kb/elmundo.yarf
+- extractors/elmundo.yarf
 - lib/scrappy.rb
 - lib/scrappy/agent/agent.rb
 - lib/scrappy/agent/blind_agent.rb
@@ -241,13 +222,14 @@ files:
 - lib/scrappy/extractor/selectors/uri_pattern.rb
 - lib/scrappy/extractor/selectors/visual.rb
 - lib/scrappy/extractor/selectors/xpath.rb
+- lib/scrappy/learning/optimizer.rb
+- lib/scrappy/learning/trainer.rb
 - lib/scrappy/repository.rb
 - lib/scrappy/server/admin.rb
 - lib/scrappy/server/errors.rb
 - lib/scrappy/server/helpers.rb
 - lib/scrappy/server/server.rb
 - lib/scrappy/support.rb
-- lib/scrappy/trainer/trainer.rb
 - public/favicon.ico
 - public/images/logo.png
 - public/images/logo_tiny.png
@@ -278,20 +260,16 @@ rdoc_options:
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      hash: 3
       segments:
       - 0
       version: "0"
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      hash: 11
       segments:
       - 1
       - 2
@@ -299,10 +277,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project: scrappy
-rubygems_version: 1.3.7
+rubygems_version: 1.3.6
 signing_key:
 specification_version: 3
 summary: Web scraper that allows producing RDF data out of plain web pages
 test_files:
-- test/test_helper.rb
 - test/test_scrappy.rb
+- test/test_helper.rb