scrappy 0.1.16 → 0.1.17

data/History.txt CHANGED
@@ -1,3 +1,8 @@
+ === 0.1.17 2011-02-15
+
+ * Enabling headless yarf serialization in shell mode
+ * Small fixes
+
  === 0.1.16 2011-02-09

  * Added support to RDF abbreviated output by loading kb's prefixes
data/README.rdoc CHANGED
@@ -123,13 +123,13 @@ scrappy offers many different interfaces to get RDF data from a web page:
    require 'scrappy'

    # Parse a knowledge base
-   kb = RDF::Parser.parse(:rdf, open("kb.rdf").read)
+   kb = RDF::Parser.parse :yarf, open("https://github.com/josei/scrappy/raw/master/kb/elmundo.yarf").read

    # Create an agent
-   agent = scrappy::Agent.create :kb=>kb
+   agent = Scrappy::Agent.create :kb=>kb

    # Get RDF output
-   output = agent.request :method=>:get, :uri=>'http://www.example.com'
+   output = agent.request :method=>:get, :uri=>'http://www.elmundo.es'

    # Output all titles from the web page
    titles = output.find([], Node('dc:title'), nil)
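Besides pointing the example at a published knowledge base, the rewrite fixes a real bug: `scrappy::Agent` is not a valid constant path in Ruby, so the old snippet raised a NameError before doing any work. A minimal illustration (stand-in module, just to show the constant lookup):

    module Scrappy; class Agent; end; end   # stand-in for the real gem

    Scrappy::Agent   # => Scrappy::Agent
    scrappy::Agent   # NameError: lowercase `scrappy' is parsed as a method call, not a constant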
data/bin/scrappy CHANGED
@@ -41,7 +41,7 @@ module Scrappy
    opts.on('-p URL', '--post URL') { |url| Options.url = url; Options.http_method=:post }
    opts.on('-D', '--dump') { Agent::Options.dump = true; Agent::Options.format = :rdf }
    opts.on('-u', '--debug') { Agent::Options.debug = true }
-   opts.on('-i', '--interactive') { Options.shell = true }
+   opts.on('-i', '--interactive') { Options.shell = true; Agent::Options.format_header = false }
    opts.on('-s', '--server') { Options.server = true }
    opts.on('-S', '--proxy-server') { Options.proxy = true }
    opts.on('-P P', '--port P') { |p| Options.port = p }
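This is the "headless yarf serialization in shell mode" entry from the changelog: passing -i now also clears the new format_header flag, so results printed in the interactive shell skip the YARF namespace header. A sketch of the equivalent from plain Ruby (names are exactly the ones this diff sets):

    require 'scrappy'

    # What -i now does before the shell starts: subsequent
    # serializations omit the YARF namespace header.
    Scrappy::Agent::Options.format_header = false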
@@ -109,8 +109,8 @@ Options
    -S, --proxy-server Runs web proxy
    -P, --port PORT Selects port number (default is 3434)
    -v, --visual Uses visual agent (slow)
-   -r, --reference Outputs referenceable data (requires -v)
-   -R, --reference-all Outputs all HTML referenceable data (requires -v)
+   -r, --reference Outputs referenceable data
+   -R, --reference-all Outputs all HTML referenceable data
    -w, --window Shows browser window (requires -v)

  Authors
data/lib/scrappy.rb CHANGED
@@ -21,7 +21,7 @@ require 'scrappy/agent/agent'
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'

  module Scrappy
-   VERSION = '0.1.16'
+   VERSION = '0.1.17'
  end

  # Require selectors
@@ -5,7 +5,7 @@ module Scrappy
    include MapReduce
    include Cached

-   Options = OpenStruct.new :format=>:yarf, :depth=>0, :agent=>:blind, :delay=>0, :workers=>10
+   Options = OpenStruct.new :format=>:yarf, :format_header=>true, :depth=>0, :agent=>:blind, :delay=>0, :workers=>10
    ContentTypes = { :png => 'image/png', :rdfxml => 'application/rdf+xml',
                     :rdf => 'application/rdf+xml' }

@@ -41,7 +41,7 @@ module Scrappy
    end

    def map args, queue=nil
-     depth = args[:depth]
+     depth = args[:depth] || options.depth
      request = { :method=>args[:method]||:get, :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{} }

      # Expire cache
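Before this change, a call without an explicit :depth left depth as nil; now it falls back to the configured default (0, per the Options struct above). Hypothetical calls, for illustration only:

    agent.map :uri => 'http://www.example.com'               # depth => options.depth (0 by default)
    agent.map :uri => 'http://www.example.com', :depth => 2  # explicit depth still wins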
@@ -139,7 +139,7 @@ module Scrappy
      print "Serializing..."; $stdout.flush
    end

-   output = response.serialize request[:format]
+   output = response.serialize request[:format], @options.format_header
 
    puts 'done!'if options.debug

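The extra argument threads the new flag down to serialization, which is what makes the shell's headless mode take effect. Roughly (a sketch; response is the RDF graph the agent built, and the boolean is the format_header option set above):

    response.serialize :yarf, true   # full output, namespace header included (old behaviour)
    response.serialize :yarf, false  # headless output, as in interactive shell mode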
@@ -1,16 +1,20 @@
  module Scrappy
    module Formats

-     def format node, formats
+     def format node, formats, uri
        case formats.first
        when Node('sc:WikiText') then
          doc = Nokogiri::XML(node.to_html)
+         doc.search("a").each {|n| n.replace(Nokogiri::XML::Text.new(URI.parse(uri).merge(n["href"]).to_s, n.document)) }
          doc.search("h1").each {|n| n.replace(Nokogiri::XML::Text.new("= #{n.text.strip} =", n.document)) }
          doc.search("h2").each {|n| n.replace(Nokogiri::XML::Text.new("== #{n.text.strip} ==", n.document)) }
          doc.search("h3").each {|n| n.replace(Nokogiri::XML::Text.new("=== #{n.text.strip} ===", n.document)) }
          doc.search("h4").each {|n| n.replace(Nokogiri::XML::Text.new("==== #{n.text.strip} ====", n.document)) }
          doc.search("h5").each {|n| n.replace(Nokogiri::XML::Text.new("===== #{n.text.strip} =====", n.document)) }
          doc.search("b").each {|n| n.replace(Nokogiri::XML::Text.new("'''#{n.text.strip}'''", n.document)) }
+         doc.search("td").each {|n| n.replace(Nokogiri::XML::Text.new("<td>#{n.text.strip}</td>", n.document)) }
+         doc.search("tr").each {|n| n.replace(Nokogiri::XML::Text.new("<tr>#{n.text.strip}</tr>", n.document)) }
+         doc.search("table").each {|n| n.replace(Nokogiri::XML::Text.new("<table>#{n.text.strip}</table>", n.document)) }
          doc.search("li li li li li").each {|n| n.replace(Nokogiri::XML::Text.new("***** #{n.text.strip}", n.document)) }
          doc.search("li li li li").each {|n| n.replace(Nokogiri::XML::Text.new("**** #{n.text.strip}", n.document)) }
          doc.search("li li li").each {|n| n.replace(Nokogiri::XML::Text.new("*** #{n.text.strip}", n.document)) }
@@ -1,10 +1,12 @@
  module RootSelector
+   extend Scrappy::Formats
+
    def self.filter selector, doc
      if selector.sc::attribute.first
        # Select node's attribute if given
        selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute] } }
      else
-       [ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
+       [ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:content], selector.sc::format, doc[:uri]) } ]
      end
    end
  end
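extend (rather than include) is what makes format callable here: filter is defined on the module itself, so the helper has to land on the module's singleton. A minimal illustration (hypothetical Helpers module, unrelated to the gem):

    module Helpers
      def shout(s) s.upcase end
    end

    module WithExtend
      extend Helpers
      def self.filter(s) shout(s) end   # works: shout is now a method of WithExtend itself
    end

    WithExtend.filter("hi")  # => "HI"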
@@ -5,9 +5,9 @@ module SectionSelector
    selector.rdf::value.map do |pattern|
      doc[:content].search('h1, h2, h3, h4, h5, h6, h7, h8, h9, h10').select { |n| n.parent.name!='script' and n.text.downcase.strip == pattern }.map do |node|
        found = false
-       content = node.parent.children[node.parent.children.index(node)+1..-1].select { |n| found ||= (n.name==node.name); !found }
+       content = node.parent.children[node.parent.children.index(node)+1..-1].select { |n| found ||= (n.name==node.name or n.name=='div'); !found }

-       [ { :uri=>doc[:uri], :content=>content, :value=>content.map{|t| format(t, selector.sc::format)}.select{|t| t.to_s.strip!=''}*"\n\n" } ]
+       [ { :uri=>doc[:uri], :content=>content, :value=>content.map{|t| format(t, selector.sc::format, doc[:uri])}.select{|t| t.to_s.strip!=''}*"\n\n" } ]
      end
    end.flatten
  end
@@ -15,7 +15,7 @@ module XPathSelector
      selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
    else
      # Select node
-     [ { :uri=>doc[:uri], :content=>result, :value=>format(result, selector.sc::format) } ]
+     [ { :uri=>doc[:uri], :content=>result, :value=>format(result, selector.sc::format, doc[:uri]) } ]
    end
  end
  end.flatten
data/scrappy.gemspec CHANGED
@@ -2,11 +2,11 @@

  Gem::Specification.new do |s|
    s.name = %q{scrappy}
-   s.version = "0.1.16"
+   s.version = "0.1.17"

    s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
    s.authors = ["Jose Ignacio"]
-   s.date = %q{2011-02-09}
+   s.date = %q{2011-02-15}
    s.default_executable = %q{scrappy}
    s.description = %q{RDF web scraper}
    s.email = %q{joseignacio.fernandez@gmail.com}
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
    segments:
    - 0
    - 1
-   - 16
-   version: 0.1.16
+   - 17
+   version: 0.1.17
  platform: ruby
  authors:
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2011-02-09 00:00:00 +01:00
+ date: 2011-02-15 00:00:00 +01:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency