scrappy 0.1.16 → 0.1.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -0
- data/README.rdoc +3 -3
- data/bin/scrappy +3 -3
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/agent/agent.rb +3 -3
- data/lib/scrappy/agent/formats.rb +5 -1
- data/lib/scrappy/selectors/root.rb +3 -1
- data/lib/scrappy/selectors/section.rb +2 -2
- data/lib/scrappy/selectors/xpath.rb +1 -1
- data/scrappy.gemspec +2 -2
- metadata +3 -3
data/History.txt
CHANGED
data/README.rdoc
CHANGED
@@ -123,13 +123,13 @@ scrappy offers many different interfaces to get RDF data from a web page:
|
|
123
123
|
require 'scrappy'
|
124
124
|
|
125
125
|
# Parse a knowledge base
|
126
|
-
kb = RDF::Parser.parse
|
126
|
+
kb = RDF::Parser.parse :yarf, open("https://github.com/josei/scrappy/raw/master/kb/elmundo.yarf").read
|
127
127
|
|
128
128
|
# Create an agent
|
129
|
-
agent =
|
129
|
+
agent = Scrappy::Agent.create :kb=>kb
|
130
130
|
|
131
131
|
# Get RDF output
|
132
|
-
output = agent.request :method=>:get, :uri=>'http://www.
|
132
|
+
output = agent.request :method=>:get, :uri=>'http://www.elmundo.es'
|
133
133
|
|
134
134
|
# Output all titles from the web page
|
135
135
|
titles = output.find([], Node('dc:title'), nil)
|
data/bin/scrappy
CHANGED
@@ -41,7 +41,7 @@ module Scrappy
|
|
41
41
|
opts.on('-p URL', '--post URL') { |url| Options.url = url; Options.http_method=:post }
|
42
42
|
opts.on('-D', '--dump') { Agent::Options.dump = true; Agent::Options.format = :rdf }
|
43
43
|
opts.on('-u', '--debug') { Agent::Options.debug = true }
|
44
|
-
opts.on('-i', '--interactive') { Options.shell = true }
|
44
|
+
opts.on('-i', '--interactive') { Options.shell = true; Agent::Options.format_header = false }
|
45
45
|
opts.on('-s', '--server') { Options.server = true }
|
46
46
|
opts.on('-S', '--proxy-server') { Options.proxy = true }
|
47
47
|
opts.on('-P P', '--port P') { |p| Options.port = p }
|
@@ -109,8 +109,8 @@ Options
|
|
109
109
|
-S, --proxy-server Runs web proxy
|
110
110
|
-P, --port PORT Selects port number (default is 3434)
|
111
111
|
-v, --visual Uses visual agent (slow)
|
112
|
-
-r, --reference Outputs referenceable data
|
113
|
-
-R, --reference-all Outputs all HTML referenceable data
|
112
|
+
-r, --reference Outputs referenceable data
|
113
|
+
-R, --reference-all Outputs all HTML referenceable data
|
114
114
|
-w, --window Shows browser window (requires -v)
|
115
115
|
|
116
116
|
Authors
|
data/lib/scrappy.rb
CHANGED
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -5,7 +5,7 @@ module Scrappy
|
|
5
5
|
include MapReduce
|
6
6
|
include Cached
|
7
7
|
|
8
|
-
Options = OpenStruct.new :format=>:yarf, :depth=>0, :agent=>:blind, :delay=>0, :workers=>10
|
8
|
+
Options = OpenStruct.new :format=>:yarf, :format_header=>true, :depth=>0, :agent=>:blind, :delay=>0, :workers=>10
|
9
9
|
ContentTypes = { :png => 'image/png', :rdfxml => 'application/rdf+xml',
|
10
10
|
:rdf => 'application/rdf+xml' }
|
11
11
|
|
@@ -41,7 +41,7 @@ module Scrappy
|
|
41
41
|
end
|
42
42
|
|
43
43
|
def map args, queue=nil
|
44
|
-
depth = args[:depth]
|
44
|
+
depth = args[:depth] || options.depth
|
45
45
|
request = { :method=>args[:method]||:get, :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{} }
|
46
46
|
|
47
47
|
# Expire cache
|
@@ -139,7 +139,7 @@ module Scrappy
|
|
139
139
|
print "Serializing..."; $stdout.flush
|
140
140
|
end
|
141
141
|
|
142
|
-
output = response.serialize request[:format]
|
142
|
+
output = response.serialize request[:format], @options.format_header
|
143
143
|
|
144
144
|
puts 'done!'if options.debug
|
145
145
|
|
data/lib/scrappy/agent/formats.rb
CHANGED
@@ -1,16 +1,20 @@
|
|
1
1
|
module Scrappy
|
2
2
|
module Formats
|
3
3
|
|
4
|
-
def format node, formats
|
4
|
+
def format node, formats, uri
|
5
5
|
case formats.first
|
6
6
|
when Node('sc:WikiText') then
|
7
7
|
doc = Nokogiri::XML(node.to_html)
|
8
|
+
doc.search("a").each {|n| n.replace(Nokogiri::XML::Text.new(URI.parse(uri).merge(n["href"]).to_s, n.document)) }
|
8
9
|
doc.search("h1").each {|n| n.replace(Nokogiri::XML::Text.new("= #{n.text.strip} =", n.document)) }
|
9
10
|
doc.search("h2").each {|n| n.replace(Nokogiri::XML::Text.new("== #{n.text.strip} ==", n.document)) }
|
10
11
|
doc.search("h3").each {|n| n.replace(Nokogiri::XML::Text.new("=== #{n.text.strip} ===", n.document)) }
|
11
12
|
doc.search("h4").each {|n| n.replace(Nokogiri::XML::Text.new("==== #{n.text.strip} ====", n.document)) }
|
12
13
|
doc.search("h5").each {|n| n.replace(Nokogiri::XML::Text.new("===== #{n.text.strip} =====", n.document)) }
|
13
14
|
doc.search("b").each {|n| n.replace(Nokogiri::XML::Text.new("'''#{n.text.strip}'''", n.document)) }
|
15
|
+
doc.search("td").each {|n| n.replace(Nokogiri::XML::Text.new("<td>#{n.text.strip}</td>", n.document)) }
|
16
|
+
doc.search("tr").each {|n| n.replace(Nokogiri::XML::Text.new("<tr>#{n.text.strip}</tr>", n.document)) }
|
17
|
+
doc.search("table").each {|n| n.replace(Nokogiri::XML::Text.new("<table>#{n.text.strip}</table>", n.document)) }
|
14
18
|
doc.search("li li li li li").each {|n| n.replace(Nokogiri::XML::Text.new("***** #{n.text.strip}", n.document)) }
|
15
19
|
doc.search("li li li li").each {|n| n.replace(Nokogiri::XML::Text.new("**** #{n.text.strip}", n.document)) }
|
16
20
|
doc.search("li li li").each {|n| n.replace(Nokogiri::XML::Text.new("*** #{n.text.strip}", n.document)) }
|
data/lib/scrappy/selectors/root.rb
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
module RootSelector
|
2
|
+
extend Scrappy::Formats
|
3
|
+
|
2
4
|
def self.filter selector, doc
|
3
5
|
if selector.sc::attribute.first
|
4
6
|
# Select node's attribute if given
|
5
7
|
selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute] } }
|
6
8
|
else
|
7
|
-
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].
|
9
|
+
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:content], selector.sc::format, doc[:uri]) } ]
|
8
10
|
end
|
9
11
|
end
|
10
12
|
end
|
data/lib/scrappy/selectors/section.rb
CHANGED
@@ -5,9 +5,9 @@ module SectionSelector
|
|
5
5
|
selector.rdf::value.map do |pattern|
|
6
6
|
doc[:content].search('h1, h2, h3, h4, h5, h6, h7, h8, h9, h10').select { |n| n.parent.name!='script' and n.text.downcase.strip == pattern }.map do |node|
|
7
7
|
found = false
|
8
|
-
content = node.parent.children[node.parent.children.index(node)+1..-1].select { |n| found ||= (n.name==node.name); !found }
|
8
|
+
content = node.parent.children[node.parent.children.index(node)+1..-1].select { |n| found ||= (n.name==node.name or n.name=='div'); !found }
|
9
9
|
|
10
|
-
[ { :uri=>doc[:uri], :content=>content, :value=>content.map{|t| format(t, selector.sc::format)}.select{|t| t.to_s.strip!=''}*"\n\n" } ]
|
10
|
+
[ { :uri=>doc[:uri], :content=>content, :value=>content.map{|t| format(t, selector.sc::format, doc[:uri])}.select{|t| t.to_s.strip!=''}*"\n\n" } ]
|
11
11
|
end
|
12
12
|
end.flatten
|
13
13
|
end
|
data/lib/scrappy/selectors/xpath.rb
CHANGED
@@ -15,7 +15,7 @@ module XPathSelector
|
|
15
15
|
selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
|
16
16
|
else
|
17
17
|
# Select node
|
18
|
-
[ { :uri=>doc[:uri], :content=>result, :value=>format(result, selector.sc::format) } ]
|
18
|
+
[ { :uri=>doc[:uri], :content=>result, :value=>format(result, selector.sc::format, doc[:uri]) } ]
|
19
19
|
end
|
20
20
|
end
|
21
21
|
end.flatten
|
data/scrappy.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.1.16"
|
5
|
+
s.version = "0.1.17"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-02-
|
9
|
+
s.date = %q{2011-02-15}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 16
|
9
|
-
version: 0.1.16
|
8
|
+
- 17
|
9
|
+
version: 0.1.17
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-02-
|
17
|
+
date: 2011-02-15 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|