scrappy 0.1.1 → 0.1.2

data/History.txt CHANGED
@@ -1,4 +1,8 @@
- === 0.1.1 2010-09-30
+ === 0.1.2 2010-11-03
+
+ * Fix for script portability (shebang arguments)
+
+ === 0.1.1 2010-10-30

  * Extremely basic annotator tool
  * Visual agent forces no concurrency to prevent memory leaks
data/Manifest CHANGED
@@ -1,4 +1,5 @@
  History.txt
+ Manifest
  README.rdoc
  Rakefile
  bin/scrappy
@@ -11,10 +12,16 @@ lib/scrappy/agent/cluster.rb
  lib/scrappy/agent/extractor.rb
  lib/scrappy/agent/visual_agent.rb
  lib/scrappy/proxy.rb
+ lib/scrappy/selectors/base_uri.rb
+ lib/scrappy/selectors/css.rb
+ lib/scrappy/selectors/root.rb
+ lib/scrappy/selectors/slice.rb
+ lib/scrappy/selectors/uri.rb
+ lib/scrappy/selectors/uri_pattern.rb
+ lib/scrappy/selectors/xpath.rb
  lib/scrappy/server.rb
  lib/scrappy/shell.rb
  lib/scrappy/support.rb
  lib/scrappy/webkit/webkit.rb
  test/test_helper.rb
  test/test_scrappy.rb
- Manifest
data/README.rdoc CHANGED
@@ -150,6 +150,12 @@ Additionally, some extra libraries are needed for certain features:

  * PNG output of RDF graphs requires Graphviz (in Debian systems: sudo aptitude install graphviz).

+ == CONTRIBUTORS:
+
+ * José Ignacio Fernández
+
+ * Jacobo Blasco
+
  == LICENSE:

  (The MIT License)
data/bin/scrappy CHANGED
@@ -30,6 +30,7 @@ module Scrappy
  Options.port = 3434
  Options.concurrence = 10
  Agent::Options.depth = 1
+ args = ARGV.map { |arg| arg.split(" ") }.flatten

  OptionParser.new do |opts|
  opts.on('-V', '--version') { output_version; exit 0 }
@@ -48,8 +49,8 @@ module Scrappy
  opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
  opts.on('-w', '--window') { Agent::Options.window = true }
  opts.on('-f FORMAT', '--format FORMAT') { |f| Agent::Options.format = f.to_sym }
- end.parse!(ARGV)
- @file = ARGV.shift
+ end.parse!(args)
+ @file = args.shift
  end

  def run
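
Note on the change above: the new args line implements the "script portability (shebang arguments)" entry from History.txt. On Linux, everything that follows the interpreter in a shebang line reaches the program as a single argument, so options written in a shebang (presumably something like "#!/usr/bin/env scrappy -f json") arrive glued together in one ARGV entry. Re-splitting each entry on spaces before OptionParser runs restores the individual tokens. A minimal sketch of the effect; the shebang and values are illustrative, not taken from the gem:

    # Illustrative only: with a shebang like "#!/usr/bin/env scrappy -f json",
    # Linux passes "-f json" to the process as ONE argument, so ARGV can be:
    raw_argv = ["-f json", "my_script"]

    # The new line in bin/scrappy re-splits each entry on spaces, restoring the
    # tokens OptionParser expects:
    args = raw_argv.map { |arg| arg.split(" ") }.flatten
    p args   # => ["-f", "json", "my_script"]
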
data/lib/scrappy/agent/extractor.rb CHANGED
@@ -5,10 +5,15 @@ module Scrappy
  def extract uri, html, referenceable=nil
  triples = []
  content = Nokogiri::HTML(html, nil, 'utf-8')
- uri_selectors = kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')).select{ |n| n.rdf::value.include?(uri.match(/\A([^\?]*)(\?.*\Z)?/).captures.first) }
- uri_selectors += kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector')).select{|n| n.rdf::value.any?{|v| /\A#{v.gsub('.','\.').gsub('*', '.+')}\Z/ =~ uri} }
+
+ uri_selectors = kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')) + kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector')).flatten.select do |uri_selector|
+ class_name = uri_selector.rdf::type.first.to_s.split('#').last
+ results = Kernel.const_get(class_name).filter uri_selector, {:content=>content, :uri=>uri}
+ !results.empty?
+ end

  fragments = uri_selectors.map { |uri_selector| kb.find(nil, Node('sc:selector'), uri_selector) }.flatten
+
  fragments.each do |fragment|
  extract_fragment fragment, :doc=>{:uri=>uri, :content=>content },
  :parent=>uri, :triples=>triples, :referenceable=>!referenceable.nil?
@@ -69,38 +74,16 @@ module Scrappy
  end

  def filter selector, doc
- content = doc[:content]
- uri = doc[:uri]
- results = if selector.rdf::type.include?(Node('sc:CssSelector')) or
- selector.rdf::type.include?(Node('sc:XPathSelector'))
- selector.rdf::value.map do |pattern|
- content.search(pattern).map do |result|
- if selector.sc::attribute.first
- # Select node's attribute if given
- selector.sc::attribute.map { |attribute| { :uri=>uri, :content=>result, :value=>result[attribute] } }
- else
- # Select node
- [ { :uri=>uri, :content=>result, :value=>result.text } ]
- end
- end
- end.flatten
-
- elsif selector.rdf::type.include?(Node('sc:SliceSelector'))
- text = content.text
- selector.rdf::value.map do |separator|
- slices = text.split(separator)
- selector.sc::index.map { |index| { :uri=>uri, :content=>content, :value=>slices[index.to_i].to_s.strip} }
- end.flatten
+ # From "BaseUriSelector" to "base_uri"
+ class_name = selector.rdf::type.first.to_s.split('#').last

- elsif selector.rdf::type.include?(Node('sc:BaseUriSelector'))
- [ { :uri=>uri, :content=>content, :value=>uri } ]
+ # Process selector
+ results = Kernel.const_get(class_name).filter selector, doc

- else
- [ { :uri=>uri, :content=>content, :value=>content.text } ]
- end
-
- # Process nested selectors, if any
+ # Return results if no nested selectors
  return results if selector.sc::selector.empty?
+
+ # Process nested selectors
  results.map do |result|
  selector.sc::selector.map { |s| filter s, result }
  end.flatten
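
The rewritten filter method above now delegates to the per-selector modules added below: it takes the fragment after the '#' in the selector's rdf:type URI (for example sc:XPathSelector) and resolves it with Kernel.const_get, which works because the new modules are defined at the top level rather than inside Scrappy. A minimal sketch of that dispatch idea; the module name, URI and values here are hypothetical stand-ins, not part of the gem:

    # Hypothetical stand-in mirroring the shape of the gem's selector modules.
    module DemoSelector
      def self.filter selector, doc
        [ { :uri => doc[:uri], :content => doc[:content], :value => doc[:content].to_s } ]
      end
    end

    # A selector node's rdf:type URI; only the fragment after '#' matters here.
    type_uri   = 'http://example.org/ns#DemoSelector'
    class_name = type_uri.split('#').last   # => "DemoSelector"

    # Resolve the top-level constant and delegate filtering to it, as the
    # extractor now does for every selector type.
    results = Kernel.const_get(class_name).filter(nil, { :uri => 'http://example.org/', :content => '<p>hi</p>' })
    p results.first[:value]                 # => "<p>hi</p>"
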
data/lib/scrappy/selectors/base_uri.rb ADDED
@@ -0,0 +1,5 @@
+ module BaseUriSelector
+ def self.filter selector, doc
+ [ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:uri] } ]
+ end
+ end
data/lib/scrappy/selectors/css.rb ADDED
@@ -0,0 +1,6 @@
+ module CssSelector
+ def self.filter selector, doc
+ # By using Nokogiri, CSS and XPath use the same search method
+ XPathSelector.filter selector, doc
+ end
+ end
data/lib/scrappy/selectors/root.rb ADDED
@@ -0,0 +1,5 @@
+ module RootSelector
+ def self.filter selector, doc
+ [ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
+ end
+ end
data/lib/scrappy/selectors/slice.rb ADDED
@@ -0,0 +1,8 @@
+ module SliceSelector
+ def self.filter selector, doc
+ selector.rdf::value.map do |separator|
+ slices = doc[:content].text.split(separator)
+ selector.sc::index.map { |index| { :uri=>doc[:uri], :content=>doc[:content], :value=>slices[index.to_i].to_s.strip} }
+ end.flatten
+ end
+ end
data/lib/scrappy/selectors/uri.rb ADDED
@@ -0,0 +1,10 @@
+ module UriSelector
+ def self.filter selector, doc
+ # Check if the UriSelector has this URI as value (without params: ?param1=value1&param2=value2)
+ if selector.rdf::value.include?(doc[:uri].match(/\A([^\?]*)(\?.*\Z)?/).captures.first)
+ [ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
+ else
+ []
+ end
+ end
+ end
data/lib/scrappy/selectors/uri_pattern.rb ADDED
@@ -0,0 +1,10 @@
+ module UriPatternSelector
+ def self.filter selector, doc
+ # Check if the uri fits the pattern
+ if selector.rdf::value.any? { |v| doc[:uri] =~ /\A#{v.gsub('.','\.').gsub('*', '.+')}\Z/ }
+ [ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
+ else
+ []
+ end
+ end
+ end
data/lib/scrappy/selectors/xpath.rb ADDED
@@ -0,0 +1,15 @@
+ module XPathSelector
+ def self.filter selector, doc
+ selector.rdf::value.map do |pattern|
+ doc[:content].search(pattern).map do |result|
+ if selector.sc::attribute.first
+ # Select node's attribute if given
+ selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
+ else
+ # Select node
+ [ { :uri=>doc[:uri], :content=>result, :value=>result.text } ]
+ end
+ end
+ end.flatten
+ end
+ end
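
All seven selector modules above share the same contract: self.filter(selector, doc) receives an RDF selector node plus a doc hash with :uri and :content keys, and returns an array of hashes of the same shape, which is what lets the extractor chain nested selectors over previous results. A hedged usage sketch for the XPath case, assuming scrappy 0.1.2 is installed; the OpenStruct selector below only fakes the two accessors the module reads (rdf::value and sc::attribute) and is not lightrdf's real node API:

    require 'nokogiri'
    require 'ostruct'
    require 'scrappy/selectors/xpath'   # defines XPathSelector

    # Fake selector node: just enough structure for XPathSelector.filter.
    selector = OpenStruct.new(
      :rdf => OpenStruct.new(:value => ['//h1']),   # one XPath pattern
      :sc  => OpenStruct.new(:attribute => [])      # no attribute selection
    )

    doc = { :uri => 'http://example.org/', :content => Nokogiri::HTML('<h1>Hello</h1>') }

    results = XPathSelector.filter selector, doc
    p results.map { |r| r[:value] }   # => ["Hello"]
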
data/lib/scrappy.rb CHANGED
@@ -11,6 +11,7 @@ require 'tmpdir'
  require 'lightrdf'

  require 'scrappy/support'
+
  require 'scrappy/agent/extractor'
  require 'scrappy/agent/cluster'
  require 'scrappy/agent/agent'
@@ -18,5 +19,8 @@ require 'scrappy/agent/agent'
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'

  module Scrappy
- VERSION = '0.1.1'
+ VERSION = '0.1.2'
  end
+
+ # Require selectors
+ Dir["#{File.expand_path(File.dirname(__FILE__))}/scrappy/selectors/*.rb"].each { |f| require f }
data/scrappy.gemspec CHANGED
@@ -2,17 +2,17 @@

  Gem::Specification.new do |s|
  s.name = %q{scrappy}
- s.version = "0.1.1"
+ s.version = "0.1.2"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Jose Ignacio"]
- s.date = %q{2010-10-29}
+ s.date = %q{2010-11-03}
  s.default_executable = %q{scrappy}
  s.description = %q{RDF web scraper}
  s.email = %q{joseignacio.fernandez@gmail.com}
  s.executables = ["scrappy"]
- s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
- s.files = ["History.txt", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "Manifest", "scrappy.gemspec"]
+ s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
+ s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
  s.homepage = %q{http://github.com/josei/scrappy}
  s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
  segments:
  - 0
  - 1
- - 1
- version: 0.1.1
+ - 2
+ version: 0.1.2
  platform: ruby
  authors:
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2010-10-29 00:00:00 +02:00
+ date: 2010-11-03 00:00:00 +01:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -130,12 +130,20 @@ extra_rdoc_files:
  - lib/scrappy/agent/extractor.rb
  - lib/scrappy/agent/visual_agent.rb
  - lib/scrappy/proxy.rb
+ - lib/scrappy/selectors/base_uri.rb
+ - lib/scrappy/selectors/css.rb
+ - lib/scrappy/selectors/root.rb
+ - lib/scrappy/selectors/slice.rb
+ - lib/scrappy/selectors/uri.rb
+ - lib/scrappy/selectors/uri_pattern.rb
+ - lib/scrappy/selectors/xpath.rb
  - lib/scrappy/server.rb
  - lib/scrappy/shell.rb
  - lib/scrappy/support.rb
  - lib/scrappy/webkit/webkit.rb
  files:
  - History.txt
+ - Manifest
  - README.rdoc
  - Rakefile
  - bin/scrappy
@@ -148,13 +156,19 @@ files:
  - lib/scrappy/agent/extractor.rb
  - lib/scrappy/agent/visual_agent.rb
  - lib/scrappy/proxy.rb
+ - lib/scrappy/selectors/base_uri.rb
+ - lib/scrappy/selectors/css.rb
+ - lib/scrappy/selectors/root.rb
+ - lib/scrappy/selectors/slice.rb
+ - lib/scrappy/selectors/uri.rb
+ - lib/scrappy/selectors/uri_pattern.rb
+ - lib/scrappy/selectors/xpath.rb
  - lib/scrappy/server.rb
  - lib/scrappy/shell.rb
  - lib/scrappy/support.rb
  - lib/scrappy/webkit/webkit.rb
  - test/test_helper.rb
  - test/test_scrappy.rb
- - Manifest
  - scrappy.gemspec
  has_rdoc: true
  homepage: http://github.com/josei/scrappy