scrappy 0.1.15 → 0.1.16
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +5 -0
- data/bin/scrappy +8 -1
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/agent/formats.rb +5 -1
- data/scrappy.gemspec +2 -2
- metadata +3 -3
data/History.txt
CHANGED
data/bin/scrappy
CHANGED
@@ -143,10 +143,17 @@ Copyright
|
|
143
143
|
open(cache_file) { |f| Marshal.load(f) }
|
144
144
|
else
|
145
145
|
# Load YARF files and cache kb
|
146
|
-
data = Dir["#{data_folder}/*"].inject(RDF::Graph.new)
|
146
|
+
data = Dir["#{data_folder}/*"].inject(RDF::Graph.new) do |kb, file|
|
147
|
+
extension = file.split('.').last.to_sym
|
148
|
+
graph = RDF::Parser.parse(extension, open(file).read)
|
149
|
+
kb.ns.merge! graph.ns
|
150
|
+
kb.merge!(extension==:ignore ? RDF::Graph.new : graph)
|
151
|
+
kb
|
152
|
+
end
|
147
153
|
open(cache_file, "w") { |f| Marshal.dump(data, f) }
|
148
154
|
data
|
149
155
|
end
|
156
|
+
RDF::QURI.ns.merge! Agent::Options.kb.ns
|
150
157
|
end
|
151
158
|
end
|
152
159
|
|
data/lib/scrappy.rb
CHANGED
@@ -11,9 +11,13 @@ module Scrappy
|
|
11
11
|
doc.search("h4").each {|n| n.replace(Nokogiri::XML::Text.new("==== #{n.text.strip} ====", n.document)) }
|
12
12
|
doc.search("h5").each {|n| n.replace(Nokogiri::XML::Text.new("===== #{n.text.strip} =====", n.document)) }
|
13
13
|
doc.search("b").each {|n| n.replace(Nokogiri::XML::Text.new("'''#{n.text.strip}'''", n.document)) }
|
14
|
+
doc.search("li li li li li").each {|n| n.replace(Nokogiri::XML::Text.new("***** #{n.text.strip}", n.document)) }
|
15
|
+
doc.search("li li li li").each {|n| n.replace(Nokogiri::XML::Text.new("**** #{n.text.strip}", n.document)) }
|
16
|
+
doc.search("li li li").each {|n| n.replace(Nokogiri::XML::Text.new("*** #{n.text.strip}", n.document)) }
|
17
|
+
doc.search("li li").each {|n| n.replace(Nokogiri::XML::Text.new("** #{n.text.strip}", n.document)) }
|
14
18
|
doc.search("li").each {|n| n.replace(Nokogiri::XML::Text.new("* #{n.text.strip}", n.document)) }
|
15
19
|
doc.search("ul").each {|n| n.replace(Nokogiri::XML::Text.new(n.text.strip, n.document)) }
|
16
|
-
doc.search("pre, code").each {|n| n.replace(Nokogiri::XML::Text.new("<pre
|
20
|
+
doc.search("pre, code").each {|n| n.replace(Nokogiri::XML::Text.new("<pre>#{n.text}</pre>", n.document)) }
|
17
21
|
doc.search("p").each {|n| n.replace(Nokogiri::XML::Text.new("#{n.text.strip}\n", n.document)) }
|
18
22
|
doc.text.strip
|
19
23
|
when Node('sc:Html') then
|
data/scrappy.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.1.15"
|
5
|
+
s.version = "0.1.16"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-02-
|
9
|
+
s.date = %q{2011-02-09}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 15
|
9
|
-
version: 0.1.15
|
8
|
+
- 16
|
9
|
+
version: 0.1.16
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-02-
|
17
|
+
date: 2011-02-09 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|