scrappy 0.1.15 → 0.1.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -0
- data/bin/scrappy +8 -1
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/agent/formats.rb +5 -1
- data/scrappy.gemspec +2 -2
- metadata +3 -3
data/History.txt
CHANGED
data/bin/scrappy
CHANGED
@@ -143,10 +143,17 @@ Copyright
|
|
143
143
|
open(cache_file) { |f| Marshal.load(f) }
|
144
144
|
else
|
145
145
|
# Load YARF files and cache kb
|
146
|
-
data = Dir["#{data_folder}/*"].inject(RDF::Graph.new)
|
146
|
+
data = Dir["#{data_folder}/*"].inject(RDF::Graph.new) do |kb, file|
|
147
|
+
extension = file.split('.').last.to_sym
|
148
|
+
graph = RDF::Parser.parse(extension, open(file).read)
|
149
|
+
kb.ns.merge! graph.ns
|
150
|
+
kb.merge!(extension==:ignore ? RDF::Graph.new : graph)
|
151
|
+
kb
|
152
|
+
end
|
147
153
|
open(cache_file, "w") { |f| Marshal.dump(data, f) }
|
148
154
|
data
|
149
155
|
end
|
156
|
+
RDF::QURI.ns.merge! Agent::Options.kb.ns
|
150
157
|
end
|
151
158
|
end
|
152
159
|
|
data/lib/scrappy.rb
CHANGED
@@ -11,9 +11,13 @@ module Scrappy
|
|
11
11
|
doc.search("h4").each {|n| n.replace(Nokogiri::XML::Text.new("==== #{n.text.strip} ====", n.document)) }
|
12
12
|
doc.search("h5").each {|n| n.replace(Nokogiri::XML::Text.new("===== #{n.text.strip} =====", n.document)) }
|
13
13
|
doc.search("b").each {|n| n.replace(Nokogiri::XML::Text.new("'''#{n.text.strip}'''", n.document)) }
|
14
|
+
doc.search("li li li li li").each {|n| n.replace(Nokogiri::XML::Text.new("***** #{n.text.strip}", n.document)) }
|
15
|
+
doc.search("li li li li").each {|n| n.replace(Nokogiri::XML::Text.new("**** #{n.text.strip}", n.document)) }
|
16
|
+
doc.search("li li li").each {|n| n.replace(Nokogiri::XML::Text.new("*** #{n.text.strip}", n.document)) }
|
17
|
+
doc.search("li li").each {|n| n.replace(Nokogiri::XML::Text.new("** #{n.text.strip}", n.document)) }
|
14
18
|
doc.search("li").each {|n| n.replace(Nokogiri::XML::Text.new("* #{n.text.strip}", n.document)) }
|
15
19
|
doc.search("ul").each {|n| n.replace(Nokogiri::XML::Text.new(n.text.strip, n.document)) }
|
16
|
-
doc.search("pre, code").each {|n| n.replace(Nokogiri::XML::Text.new("<pre
|
20
|
+
doc.search("pre, code").each {|n| n.replace(Nokogiri::XML::Text.new("<pre>#{n.text}</pre>", n.document)) }
|
17
21
|
doc.search("p").each {|n| n.replace(Nokogiri::XML::Text.new("#{n.text.strip}\n", n.document)) }
|
18
22
|
doc.text.strip
|
19
23
|
when Node('sc:Html') then
|
data/scrappy.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.1.15"
|
5
|
+
s.version = "0.1.16"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-02-
|
9
|
+
s.date = %q{2011-02-09}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 15
|
9
|
-
version: 0.1.15
|
8
|
+
- 16
|
9
|
+
version: 0.1.16
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-02-
|
17
|
+
date: 2011-02-09 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|