scrappy 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,4 +1,8 @@
1
- === 0.1.1 2010-09-30
1
+ === 0.1.2 2010-11-03
2
+
3
+ * Fix for script portability (shebang arguments)
4
+
5
+ === 0.1.1 2010-10-30
2
6
 
3
7
  * Extremely basic annotator tool
4
8
  * Visual agent forces no concurrency to prevent memory leaks
data/Manifest CHANGED
@@ -1,4 +1,5 @@
1
1
  History.txt
2
+ Manifest
2
3
  README.rdoc
3
4
  Rakefile
4
5
  bin/scrappy
@@ -11,10 +12,16 @@ lib/scrappy/agent/cluster.rb
11
12
  lib/scrappy/agent/extractor.rb
12
13
  lib/scrappy/agent/visual_agent.rb
13
14
  lib/scrappy/proxy.rb
15
+ lib/scrappy/selectors/base_uri.rb
16
+ lib/scrappy/selectors/css.rb
17
+ lib/scrappy/selectors/root.rb
18
+ lib/scrappy/selectors/slice.rb
19
+ lib/scrappy/selectors/uri.rb
20
+ lib/scrappy/selectors/uri_pattern.rb
21
+ lib/scrappy/selectors/xpath.rb
14
22
  lib/scrappy/server.rb
15
23
  lib/scrappy/shell.rb
16
24
  lib/scrappy/support.rb
17
25
  lib/scrappy/webkit/webkit.rb
18
26
  test/test_helper.rb
19
27
  test/test_scrappy.rb
20
- Manifest
data/README.rdoc CHANGED
@@ -150,6 +150,12 @@ Additionally, some extra libraries are needed for certain features:
150
150
 
151
151
  * PNG output of RDF graphs requires Graphviz (in Debian systems: sudo aptitude install graphviz).
152
152
 
153
+ == CONTRIBUTORS:
154
+
155
+ * José Ignacio Fernández
156
+
157
+ * Jacobo Blasco
158
+
153
159
  == LICENSE:
154
160
 
155
161
  (The MIT License)
data/bin/scrappy CHANGED
@@ -30,6 +30,7 @@ module Scrappy
30
30
  Options.port = 3434
31
31
  Options.concurrence = 10
32
32
  Agent::Options.depth = 1
33
+ args = ARGV.map { |arg| arg.split(" ") }.flatten
33
34
 
34
35
  OptionParser.new do |opts|
35
36
  opts.on('-V', '--version') { output_version; exit 0 }
@@ -48,8 +49,8 @@ module Scrappy
48
49
  opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
49
50
  opts.on('-w', '--window') { Agent::Options.window = true }
50
51
  opts.on('-f FORMAT', '--format FORMAT') { |f| Agent::Options.format = f.to_sym }
51
- end.parse!(ARGV)
52
- @file = ARGV.shift
52
+ end.parse!(args)
53
+ @file = args.shift
53
54
  end
54
55
 
55
56
  def run
@@ -5,10 +5,15 @@ module Scrappy
5
5
  def extract uri, html, referenceable=nil
6
6
  triples = []
7
7
  content = Nokogiri::HTML(html, nil, 'utf-8')
8
- uri_selectors = kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')).select{ |n| n.rdf::value.include?(uri.match(/\A([^\?]*)(\?.*\Z)?/).captures.first) }
9
- uri_selectors += kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector')).select{|n| n.rdf::value.any?{|v| /\A#{v.gsub('.','\.').gsub('*', '.+')}\Z/ =~ uri} }
8
+
9
+ uri_selectors = kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')) + kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector')).flatten.select do |uri_selector|
10
+ class_name = uri_selector.rdf::type.first.to_s.split('#').last
11
+ results = Kernel.const_get(class_name).filter uri_selector, {:content=>content, :uri=>uri}
12
+ !results.empty?
13
+ end
10
14
 
11
15
  fragments = uri_selectors.map { |uri_selector| kb.find(nil, Node('sc:selector'), uri_selector) }.flatten
16
+
12
17
  fragments.each do |fragment|
13
18
  extract_fragment fragment, :doc=>{:uri=>uri, :content=>content },
14
19
  :parent=>uri, :triples=>triples, :referenceable=>!referenceable.nil?
@@ -69,38 +74,16 @@ module Scrappy
69
74
  end
70
75
 
71
76
  def filter selector, doc
72
- content = doc[:content]
73
- uri = doc[:uri]
74
- results = if selector.rdf::type.include?(Node('sc:CssSelector')) or
75
- selector.rdf::type.include?(Node('sc:XPathSelector'))
76
- selector.rdf::value.map do |pattern|
77
- content.search(pattern).map do |result|
78
- if selector.sc::attribute.first
79
- # Select node's attribute if given
80
- selector.sc::attribute.map { |attribute| { :uri=>uri, :content=>result, :value=>result[attribute] } }
81
- else
82
- # Select node
83
- [ { :uri=>uri, :content=>result, :value=>result.text } ]
84
- end
85
- end
86
- end.flatten
87
-
88
- elsif selector.rdf::type.include?(Node('sc:SliceSelector'))
89
- text = content.text
90
- selector.rdf::value.map do |separator|
91
- slices = text.split(separator)
92
- selector.sc::index.map { |index| { :uri=>uri, :content=>content, :value=>slices[index.to_i].to_s.strip} }
93
- end.flatten
77
+ # From "BaseUriSelector" to "base_uri"
78
+ class_name = selector.rdf::type.first.to_s.split('#').last
94
79
 
95
- elsif selector.rdf::type.include?(Node('sc:BaseUriSelector'))
96
- [ { :uri=>uri, :content=>content, :value=>uri } ]
80
+ # Process selector
81
+ results = Kernel.const_get(class_name).filter selector, doc
97
82
 
98
- else
99
- [ { :uri=>uri, :content=>content, :value=>content.text } ]
100
- end
101
-
102
- # Process nested selectors, if any
83
+ # Return results if no nested selectors
103
84
  return results if selector.sc::selector.empty?
85
+
86
+ # Process nested selectors
104
87
  results.map do |result|
105
88
  selector.sc::selector.map { |s| filter s, result }
106
89
  end.flatten
@@ -0,0 +1,5 @@
1
+ module BaseUriSelector
2
+ def self.filter selector, doc
3
+ [ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:uri] } ]
4
+ end
5
+ end
@@ -0,0 +1,6 @@
1
+ module CssSelector
2
+ def self.filter selector, doc
3
+ # By using Nokogiri, CSS and XPath use the same search method
4
+ XPathSelector.filter selector, doc
5
+ end
6
+ end
@@ -0,0 +1,5 @@
1
+ module RootSelector
2
+ def self.filter selector, doc
3
+ [ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
4
+ end
5
+ end
@@ -0,0 +1,8 @@
1
+ module SliceSelector
2
+ def self.filter selector, doc
3
+ selector.rdf::value.map do |separator|
4
+ slices = doc[:content].text.split(separator)
5
+ selector.sc::index.map { |index| { :uri=>doc[:uri], :content=>doc[:content], :value=>slices[index.to_i].to_s.strip} }
6
+ end.flatten
7
+ end
8
+ end
@@ -0,0 +1,10 @@
1
+ module UriSelector
2
+ def self.filter selector, doc
3
+ # Check if the UriSelector has this URI as value (without params: ?param1=value1&param2=value2)
4
+ if selector.rdf::value.include?(doc[:uri].match(/\A([^\?]*)(\?.*\Z)?/).captures.first)
5
+ [ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
6
+ else
7
+ []
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,10 @@
1
+ module UriPatternSelector
2
+ def self.filter selector, doc
3
+ # Check if the uri fits the pattern
4
+ if selector.rdf::value.any? { |v| doc[:uri] =~ /\A#{v.gsub('.','\.').gsub('*', '.+')}\Z/ }
5
+ [ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
6
+ else
7
+ []
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,15 @@
1
+ module XPathSelector
2
+ def self.filter selector, doc
3
+ selector.rdf::value.map do |pattern|
4
+ doc[:content].search(pattern).map do |result|
5
+ if selector.sc::attribute.first
6
+ # Select node's attribute if given
7
+ selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
8
+ else
9
+ # Select node
10
+ [ { :uri=>doc[:uri], :content=>result, :value=>result.text } ]
11
+ end
12
+ end
13
+ end.flatten
14
+ end
15
+ end
data/lib/scrappy.rb CHANGED
@@ -11,6 +11,7 @@ require 'tmpdir'
11
11
  require 'lightrdf'
12
12
 
13
13
  require 'scrappy/support'
14
+
14
15
  require 'scrappy/agent/extractor'
15
16
  require 'scrappy/agent/cluster'
16
17
  require 'scrappy/agent/agent'
@@ -18,5 +19,8 @@ require 'scrappy/agent/agent'
18
19
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
19
20
 
20
21
  module Scrappy
21
- VERSION = '0.1.1'
22
+ VERSION = '0.1.2'
22
23
  end
24
+
25
+ # Require selectors
26
+ Dir["#{File.expand_path(File.dirname(__FILE__))}/scrappy/selectors/*.rb"].each { |f| require f }
data/scrappy.gemspec CHANGED
@@ -2,17 +2,17 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scrappy}
5
- s.version = "0.1.1"
5
+ s.version = "0.1.2"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
9
- s.date = %q{2010-10-29}
9
+ s.date = %q{2010-11-03}
10
10
  s.default_executable = %q{scrappy}
11
11
  s.description = %q{RDF web scraper}
12
12
  s.email = %q{joseignacio.fernandez@gmail.com}
13
13
  s.executables = ["scrappy"]
14
- s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
15
- s.files = ["History.txt", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "Manifest", "scrappy.gemspec"]
14
+ s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
15
+ s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
16
16
  s.homepage = %q{http://github.com/josei/scrappy}
17
17
  s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
18
18
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 1
9
- version: 0.1.1
8
+ - 2
9
+ version: 0.1.2
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-10-29 00:00:00 +02:00
17
+ date: 2010-11-03 00:00:00 +01:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -130,12 +130,20 @@ extra_rdoc_files:
130
130
  - lib/scrappy/agent/extractor.rb
131
131
  - lib/scrappy/agent/visual_agent.rb
132
132
  - lib/scrappy/proxy.rb
133
+ - lib/scrappy/selectors/base_uri.rb
134
+ - lib/scrappy/selectors/css.rb
135
+ - lib/scrappy/selectors/root.rb
136
+ - lib/scrappy/selectors/slice.rb
137
+ - lib/scrappy/selectors/uri.rb
138
+ - lib/scrappy/selectors/uri_pattern.rb
139
+ - lib/scrappy/selectors/xpath.rb
133
140
  - lib/scrappy/server.rb
134
141
  - lib/scrappy/shell.rb
135
142
  - lib/scrappy/support.rb
136
143
  - lib/scrappy/webkit/webkit.rb
137
144
  files:
138
145
  - History.txt
146
+ - Manifest
139
147
  - README.rdoc
140
148
  - Rakefile
141
149
  - bin/scrappy
@@ -148,13 +156,19 @@ files:
148
156
  - lib/scrappy/agent/extractor.rb
149
157
  - lib/scrappy/agent/visual_agent.rb
150
158
  - lib/scrappy/proxy.rb
159
+ - lib/scrappy/selectors/base_uri.rb
160
+ - lib/scrappy/selectors/css.rb
161
+ - lib/scrappy/selectors/root.rb
162
+ - lib/scrappy/selectors/slice.rb
163
+ - lib/scrappy/selectors/uri.rb
164
+ - lib/scrappy/selectors/uri_pattern.rb
165
+ - lib/scrappy/selectors/xpath.rb
151
166
  - lib/scrappy/server.rb
152
167
  - lib/scrappy/shell.rb
153
168
  - lib/scrappy/support.rb
154
169
  - lib/scrappy/webkit/webkit.rb
155
170
  - test/test_helper.rb
156
171
  - test/test_scrappy.rb
157
- - Manifest
158
172
  - scrappy.gemspec
159
173
  has_rdoc: true
160
174
  homepage: http://github.com/josei/scrappy