scrappy 0.1.16 → 0.1.17

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
data/History.txt CHANGED
@@ -1,3 +1,8 @@
1
+ === 0.1.17 2011-02-15
2
+
3
+ * Enabling headless yarf serialization in shell mode
4
+ * Small fixes
5
+
1
6
  === 0.1.16 2011-02-09
2
7
 
3
8
  * Added support to RDF abbreviated output by loading kb's prefixes
data/README.rdoc CHANGED
@@ -123,13 +123,13 @@ scrappy offers many different interfaces to get RDF data from a web page:
123
123
  require 'scrappy'
124
124
 
125
125
  # Parse a knowledge base
126
- kb = RDF::Parser.parse(:rdf, open("kb.rdf").read)
126
+ kb = RDF::Parser.parse :yarf, open("https://github.com/josei/scrappy/raw/master/kb/elmundo.yarf").read
127
127
 
128
128
  # Create an agent
129
- agent = scrappy::Agent.create :kb=>kb
129
+ agent = Scrappy::Agent.create :kb=>kb
130
130
 
131
131
  # Get RDF output
132
- output = agent.request :method=>:get, :uri=>'http://www.example.com'
132
+ output = agent.request :method=>:get, :uri=>'http://www.elmundo.es'
133
133
 
134
134
  # Output all titles from the web page
135
135
  titles = output.find([], Node('dc:title'), nil)
data/bin/scrappy CHANGED
@@ -41,7 +41,7 @@ module Scrappy
41
41
  opts.on('-p URL', '--post URL') { |url| Options.url = url; Options.http_method=:post }
42
42
  opts.on('-D', '--dump') { Agent::Options.dump = true; Agent::Options.format = :rdf }
43
43
  opts.on('-u', '--debug') { Agent::Options.debug = true }
44
- opts.on('-i', '--interactive') { Options.shell = true }
44
+ opts.on('-i', '--interactive') { Options.shell = true; Agent::Options.format_header = false }
45
45
  opts.on('-s', '--server') { Options.server = true }
46
46
  opts.on('-S', '--proxy-server') { Options.proxy = true }
47
47
  opts.on('-P P', '--port P') { |p| Options.port = p }
@@ -109,8 +109,8 @@ Options
109
109
  -S, --proxy-server Runs web proxy
110
110
  -P, --port PORT Selects port number (default is 3434)
111
111
  -v, --visual Uses visual agent (slow)
112
- -r, --reference Outputs referenceable data (requires -v)
113
- -R, --reference-all Outputs all HTML referenceable data (requires -v)
112
+ -r, --reference Outputs referenceable data
113
+ -R, --reference-all Outputs all HTML referenceable data
114
114
  -w, --window Shows browser window (requires -v)
115
115
 
116
116
  Authors
data/lib/scrappy.rb CHANGED
@@ -21,7 +21,7 @@ require 'scrappy/agent/agent'
21
21
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
22
22
 
23
23
  module Scrappy
24
- VERSION = '0.1.16'
24
+ VERSION = '0.1.17'
25
25
  end
26
26
 
27
27
  # Require selectors
@@ -5,7 +5,7 @@ module Scrappy
5
5
  include MapReduce
6
6
  include Cached
7
7
 
8
- Options = OpenStruct.new :format=>:yarf, :depth=>0, :agent=>:blind, :delay=>0, :workers=>10
8
+ Options = OpenStruct.new :format=>:yarf, :format_header=>true, :depth=>0, :agent=>:blind, :delay=>0, :workers=>10
9
9
  ContentTypes = { :png => 'image/png', :rdfxml => 'application/rdf+xml',
10
10
  :rdf => 'application/rdf+xml' }
11
11
 
@@ -41,7 +41,7 @@ module Scrappy
41
41
  end
42
42
 
43
43
  def map args, queue=nil
44
- depth = args[:depth]
44
+ depth = args[:depth] || options.depth
45
45
  request = { :method=>args[:method]||:get, :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{} }
46
46
 
47
47
  # Expire cache
@@ -139,7 +139,7 @@ module Scrappy
139
139
  print "Serializing..."; $stdout.flush
140
140
  end
141
141
 
142
- output = response.serialize request[:format]
142
+ output = response.serialize request[:format], @options.format_header
143
143
 
144
144
  puts 'done!'if options.debug
145
145
 
@@ -1,16 +1,20 @@
1
1
  module Scrappy
2
2
  module Formats
3
3
 
4
- def format node, formats
4
+ def format node, formats, uri
5
5
  case formats.first
6
6
  when Node('sc:WikiText') then
7
7
  doc = Nokogiri::XML(node.to_html)
8
+ doc.search("a").each {|n| n.replace(Nokogiri::XML::Text.new(URI.parse(uri).merge(n["href"]).to_s, n.document)) }
8
9
  doc.search("h1").each {|n| n.replace(Nokogiri::XML::Text.new("= #{n.text.strip} =", n.document)) }
9
10
  doc.search("h2").each {|n| n.replace(Nokogiri::XML::Text.new("== #{n.text.strip} ==", n.document)) }
10
11
  doc.search("h3").each {|n| n.replace(Nokogiri::XML::Text.new("=== #{n.text.strip} ===", n.document)) }
11
12
  doc.search("h4").each {|n| n.replace(Nokogiri::XML::Text.new("==== #{n.text.strip} ====", n.document)) }
12
13
  doc.search("h5").each {|n| n.replace(Nokogiri::XML::Text.new("===== #{n.text.strip} =====", n.document)) }
13
14
  doc.search("b").each {|n| n.replace(Nokogiri::XML::Text.new("'''#{n.text.strip}'''", n.document)) }
15
+ doc.search("td").each {|n| n.replace(Nokogiri::XML::Text.new("<td>#{n.text.strip}</td>", n.document)) }
16
+ doc.search("tr").each {|n| n.replace(Nokogiri::XML::Text.new("<tr>#{n.text.strip}</tr>", n.document)) }
17
+ doc.search("table").each {|n| n.replace(Nokogiri::XML::Text.new("<table>#{n.text.strip}</table>", n.document)) }
14
18
  doc.search("li li li li li").each {|n| n.replace(Nokogiri::XML::Text.new("***** #{n.text.strip}", n.document)) }
15
19
  doc.search("li li li li").each {|n| n.replace(Nokogiri::XML::Text.new("**** #{n.text.strip}", n.document)) }
16
20
  doc.search("li li li").each {|n| n.replace(Nokogiri::XML::Text.new("*** #{n.text.strip}", n.document)) }
@@ -1,10 +1,12 @@
1
1
  module RootSelector
2
+ extend Scrappy::Formats
3
+
2
4
  def self.filter selector, doc
3
5
  if selector.sc::attribute.first
4
6
  # Select node's attribute if given
5
7
  selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute] } }
6
8
  else
7
- [ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
9
+ [ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:content], selector.sc::format, doc[:uri]) } ]
8
10
  end
9
11
  end
10
12
  end
@@ -5,9 +5,9 @@ module SectionSelector
5
5
  selector.rdf::value.map do |pattern|
6
6
  doc[:content].search('h1, h2, h3, h4, h5, h6, h7, h8, h9, h10').select { |n| n.parent.name!='script' and n.text.downcase.strip == pattern }.map do |node|
7
7
  found = false
8
- content = node.parent.children[node.parent.children.index(node)+1..-1].select { |n| found ||= (n.name==node.name); !found }
8
+ content = node.parent.children[node.parent.children.index(node)+1..-1].select { |n| found ||= (n.name==node.name or n.name=='div'); !found }
9
9
 
10
- [ { :uri=>doc[:uri], :content=>content, :value=>content.map{|t| format(t, selector.sc::format)}.select{|t| t.to_s.strip!=''}*"\n\n" } ]
10
+ [ { :uri=>doc[:uri], :content=>content, :value=>content.map{|t| format(t, selector.sc::format, doc[:uri])}.select{|t| t.to_s.strip!=''}*"\n\n" } ]
11
11
  end
12
12
  end.flatten
13
13
  end
@@ -15,7 +15,7 @@ module XPathSelector
15
15
  selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
16
16
  else
17
17
  # Select node
18
- [ { :uri=>doc[:uri], :content=>result, :value=>format(result, selector.sc::format) } ]
18
+ [ { :uri=>doc[:uri], :content=>result, :value=>format(result, selector.sc::format, doc[:uri]) } ]
19
19
  end
20
20
  end
21
21
  end.flatten
data/scrappy.gemspec CHANGED
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scrappy}
5
- s.version = "0.1.16"
5
+ s.version = "0.1.17"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
9
- s.date = %q{2011-02-09}
9
+ s.date = %q{2011-02-15}
10
10
  s.default_executable = %q{scrappy}
11
11
  s.description = %q{RDF web scraper}
12
12
  s.email = %q{joseignacio.fernandez@gmail.com}
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 16
9
- version: 0.1.16
8
+ - 17
9
+ version: 0.1.17
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-02-09 00:00:00 +01:00
17
+ date: 2011-02-15 00:00:00 +01:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency