scrappy 0.1.16 → 0.1.17
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +5 -0
- data/README.rdoc +3 -3
- data/bin/scrappy +3 -3
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/agent/agent.rb +3 -3
- data/lib/scrappy/agent/formats.rb +5 -1
- data/lib/scrappy/selectors/root.rb +3 -1
- data/lib/scrappy/selectors/section.rb +2 -2
- data/lib/scrappy/selectors/xpath.rb +1 -1
- data/scrappy.gemspec +2 -2
- metadata +3 -3
data/History.txt
CHANGED
data/README.rdoc
CHANGED
@@ -123,13 +123,13 @@ scrappy offers many different interfaces to get RDF data from a web page:
|
|
123
123
|
require 'scrappy'
|
124
124
|
|
125
125
|
# Parse a knowledge base
|
126
|
-
kb = RDF::Parser.parse
|
126
|
+
kb = RDF::Parser.parse :yarf, open("https://github.com/josei/scrappy/raw/master/kb/elmundo.yarf").read
|
127
127
|
|
128
128
|
# Create an agent
|
129
|
-
agent =
|
129
|
+
agent = Scrappy::Agent.create :kb=>kb
|
130
130
|
|
131
131
|
# Get RDF output
|
132
|
-
output = agent.request :method=>:get, :uri=>'http://www.
|
132
|
+
output = agent.request :method=>:get, :uri=>'http://www.elmundo.es'
|
133
133
|
|
134
134
|
# Output all titles from the web page
|
135
135
|
titles = output.find([], Node('dc:title'), nil)
|
data/bin/scrappy
CHANGED
@@ -41,7 +41,7 @@ module Scrappy
|
|
41
41
|
opts.on('-p URL', '--post URL') { |url| Options.url = url; Options.http_method=:post }
|
42
42
|
opts.on('-D', '--dump') { Agent::Options.dump = true; Agent::Options.format = :rdf }
|
43
43
|
opts.on('-u', '--debug') { Agent::Options.debug = true }
|
44
|
-
opts.on('-i', '--interactive') { Options.shell = true }
|
44
|
+
opts.on('-i', '--interactive') { Options.shell = true; Agent::Options.format_header = false }
|
45
45
|
opts.on('-s', '--server') { Options.server = true }
|
46
46
|
opts.on('-S', '--proxy-server') { Options.proxy = true }
|
47
47
|
opts.on('-P P', '--port P') { |p| Options.port = p }
|
@@ -109,8 +109,8 @@ Options
|
|
109
109
|
-S, --proxy-server Runs web proxy
|
110
110
|
-P, --port PORT Selects port number (default is 3434)
|
111
111
|
-v, --visual Uses visual agent (slow)
|
112
|
-
-r, --reference Outputs referenceable data
|
113
|
-
-R, --reference-all Outputs all HTML referenceable data
|
112
|
+
-r, --reference Outputs referenceable data
|
113
|
+
-R, --reference-all Outputs all HTML referenceable data
|
114
114
|
-w, --window Shows browser window (requires -v)
|
115
115
|
|
116
116
|
Authors
|
data/lib/scrappy.rb
CHANGED
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -5,7 +5,7 @@ module Scrappy
|
|
5
5
|
include MapReduce
|
6
6
|
include Cached
|
7
7
|
|
8
|
-
Options = OpenStruct.new :format=>:yarf, :depth=>0, :agent=>:blind, :delay=>0, :workers=>10
|
8
|
+
Options = OpenStruct.new :format=>:yarf, :format_header=>true, :depth=>0, :agent=>:blind, :delay=>0, :workers=>10
|
9
9
|
ContentTypes = { :png => 'image/png', :rdfxml => 'application/rdf+xml',
|
10
10
|
:rdf => 'application/rdf+xml' }
|
11
11
|
|
@@ -41,7 +41,7 @@ module Scrappy
|
|
41
41
|
end
|
42
42
|
|
43
43
|
def map args, queue=nil
|
44
|
-
depth = args[:depth]
|
44
|
+
depth = args[:depth] || options.depth
|
45
45
|
request = { :method=>args[:method]||:get, :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{} }
|
46
46
|
|
47
47
|
# Expire cache
|
@@ -139,7 +139,7 @@ module Scrappy
|
|
139
139
|
print "Serializing..."; $stdout.flush
|
140
140
|
end
|
141
141
|
|
142
|
-
output = response.serialize request[:format]
|
142
|
+
output = response.serialize request[:format], @options.format_header
|
143
143
|
|
144
144
|
puts 'done!'if options.debug
|
145
145
|
|
data/lib/scrappy/agent/formats.rb
CHANGED
@@ -1,16 +1,20 @@
|
|
1
1
|
module Scrappy
|
2
2
|
module Formats
|
3
3
|
|
4
|
-
def format node, formats
|
4
|
+
def format node, formats, uri
|
5
5
|
case formats.first
|
6
6
|
when Node('sc:WikiText') then
|
7
7
|
doc = Nokogiri::XML(node.to_html)
|
8
|
+
doc.search("a").each {|n| n.replace(Nokogiri::XML::Text.new(URI.parse(uri).merge(n["href"]).to_s, n.document)) }
|
8
9
|
doc.search("h1").each {|n| n.replace(Nokogiri::XML::Text.new("= #{n.text.strip} =", n.document)) }
|
9
10
|
doc.search("h2").each {|n| n.replace(Nokogiri::XML::Text.new("== #{n.text.strip} ==", n.document)) }
|
10
11
|
doc.search("h3").each {|n| n.replace(Nokogiri::XML::Text.new("=== #{n.text.strip} ===", n.document)) }
|
11
12
|
doc.search("h4").each {|n| n.replace(Nokogiri::XML::Text.new("==== #{n.text.strip} ====", n.document)) }
|
12
13
|
doc.search("h5").each {|n| n.replace(Nokogiri::XML::Text.new("===== #{n.text.strip} =====", n.document)) }
|
13
14
|
doc.search("b").each {|n| n.replace(Nokogiri::XML::Text.new("'''#{n.text.strip}'''", n.document)) }
|
15
|
+
doc.search("td").each {|n| n.replace(Nokogiri::XML::Text.new("<td>#{n.text.strip}</td>", n.document)) }
|
16
|
+
doc.search("tr").each {|n| n.replace(Nokogiri::XML::Text.new("<tr>#{n.text.strip}</tr>", n.document)) }
|
17
|
+
doc.search("table").each {|n| n.replace(Nokogiri::XML::Text.new("<table>#{n.text.strip}</table>", n.document)) }
|
14
18
|
doc.search("li li li li li").each {|n| n.replace(Nokogiri::XML::Text.new("***** #{n.text.strip}", n.document)) }
|
15
19
|
doc.search("li li li li").each {|n| n.replace(Nokogiri::XML::Text.new("**** #{n.text.strip}", n.document)) }
|
16
20
|
doc.search("li li li").each {|n| n.replace(Nokogiri::XML::Text.new("*** #{n.text.strip}", n.document)) }
|
data/lib/scrappy/selectors/root.rb
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
module RootSelector
|
2
|
+
extend Scrappy::Formats
|
3
|
+
|
2
4
|
def self.filter selector, doc
|
3
5
|
if selector.sc::attribute.first
|
4
6
|
# Select node's attribute if given
|
5
7
|
selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute] } }
|
6
8
|
else
|
7
|
-
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].
|
9
|
+
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:content], selector.sc::format, doc[:uri]) } ]
|
8
10
|
end
|
9
11
|
end
|
10
12
|
end
|
data/lib/scrappy/selectors/section.rb
CHANGED
@@ -5,9 +5,9 @@ module SectionSelector
|
|
5
5
|
selector.rdf::value.map do |pattern|
|
6
6
|
doc[:content].search('h1, h2, h3, h4, h5, h6, h7, h8, h9, h10').select { |n| n.parent.name!='script' and n.text.downcase.strip == pattern }.map do |node|
|
7
7
|
found = false
|
8
|
-
content = node.parent.children[node.parent.children.index(node)+1..-1].select { |n| found ||= (n.name==node.name); !found }
|
8
|
+
content = node.parent.children[node.parent.children.index(node)+1..-1].select { |n| found ||= (n.name==node.name or n.name=='div'); !found }
|
9
9
|
|
10
|
-
[ { :uri=>doc[:uri], :content=>content, :value=>content.map{|t| format(t, selector.sc::format)}.select{|t| t.to_s.strip!=''}*"\n\n" } ]
|
10
|
+
[ { :uri=>doc[:uri], :content=>content, :value=>content.map{|t| format(t, selector.sc::format, doc[:uri])}.select{|t| t.to_s.strip!=''}*"\n\n" } ]
|
11
11
|
end
|
12
12
|
end.flatten
|
13
13
|
end
|
data/lib/scrappy/selectors/xpath.rb
CHANGED
@@ -15,7 +15,7 @@ module XPathSelector
|
|
15
15
|
selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
|
16
16
|
else
|
17
17
|
# Select node
|
18
|
-
[ { :uri=>doc[:uri], :content=>result, :value=>format(result, selector.sc::format) } ]
|
18
|
+
[ { :uri=>doc[:uri], :content=>result, :value=>format(result, selector.sc::format, doc[:uri]) } ]
|
19
19
|
end
|
20
20
|
end
|
21
21
|
end.flatten
|
data/scrappy.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.1.16"
|
5
|
+
s.version = "0.1.17"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-02-
|
9
|
+
s.date = %q{2011-02-15}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 16
|
9
|
-
version: 0.1.16
|
8
|
+
- 17
|
9
|
+
version: 0.1.17
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-02-
|
17
|
+
date: 2011-02-15 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|