scrappy 0.1.24 → 0.2.0

data/History.txt CHANGED
@@ -1,3 +1,7 @@
+ === 0.2.0 2011-03-09
+
+ * Integration with cmoft's improvements: Sesame support, time-aggregated requests and observing URLs
+
  === 0.1.24 2011-03-08
 
  * Using RDF::NodeProxy from lightRDF 0.2
data/Manifest CHANGED
@@ -30,6 +30,7 @@ lib/scrappy/server/public/images/logo_small.png
  lib/scrappy/server/public/stylesheets/application.css
  lib/scrappy/server/views/home.haml
  lib/scrappy/server/views/help.haml
+ lib/scrappy/repository.rb
  lib/scrappy/shell.rb
  lib/scrappy/support.rb
  lib/scrappy/webkit/webkit.rb
data/README.rdoc CHANGED
@@ -138,8 +138,43 @@ scrappy offers many different interfaces to get RDF data from a web page:
  titles = output.find([], Node('dc:title'), nil)
  titles.each { |title| puts title }
 
- == INSTALL:
+ * RDF repository:
+
+ Sesame functionality has been included in Scrappy. You can configure
+ the repository options by editing the config.yml file placed in the .scrappy folder in your home directory.
+ An example of this file can be found at the end of this README.
+
+ You can get the data for a certain period of time by using the time (-t, --time) option:
+
+ $ scrappy -g example.org -t 3
+
+ This would get all the data stored in Sesame for example.org in the last 3 minutes.
+
+ * Sample config.yml
 
+ # This is a sample configuration file, with the options to communicate with Sesame using Scrappy
+ repository:
+   # The host where Sesame is. Do not add the trailing '/'
+   host: http://localhost
+
+   # The port for the connection
+   port: 8080
+
+   # The time to consider the data in the repository valid, in minutes
+   time: 15
+
+   # The name of the repository
+   repository: memory
+
+   # The format to communicate with the repository
+   format: ntriples
+
+   # You can use any of the following formats:
+   # rdfxml, ntriples, turtle, n3, trix, trig
+
+
+ == INSTALL:
+
  Install it as any other gem:
 
  $ gem install scrappy
@@ -153,10 +188,15 @@ Additionally, some extra libraries are needed for certain features:
 
  * PNG output of RDF graphs requires Graphviz (in Debian systems: sudo aptitude install graphviz).
 
+ In order to use Sesame, you will need to install it. Further instructions can be found
+ on the openRDF website, specifically at http://www.openrdf.org/doc/sesame2/users/ch06.html .
+
  == CONTRIBUTORS:
 
  * José Ignacio Fernández
 
+ * Alberto Mardomingo
+
  * Jacobo Blasco
 
  == LICENSE:
@@ -182,4 +222,4 @@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
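
A usage sketch of the new observation mode wired up in bin/scrappy below (hypothetical hosts; the URI list after -o is comma-separated, and the cycle length comes from the time value in config.yml rather than from -t):

  $ scrappy -o example.org,example.com

With the sample configuration above (time: 15), this pings both sites and stores each extraction in Sesame as a new context every 15 minutes.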
data/Rakefile CHANGED
@@ -11,7 +11,7 @@ Echoe.new('scrappy', Scrappy::VERSION) do |p|
  p.email = "joseignacio.fernandez@gmail.com"
  p.install_message = '**(Optional) Remember to install rbwebkitgtk for visual parsing features**'
  p.ignore_pattern = ["pkg/*"]
- p.dependencies = [['activesupport','>= 2.3.5'], ['sinatra', '>= 1.1.2'], ['thin', '>= 1.2.7'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.2.0'], ['i18n', '>= 0.4.2'], ['haml', '>= 3.0.24']]
+ p.dependencies = [['activesupport','>= 2.3.5'], ['sinatra', '>= 1.1.2'], ['thin', '>= 1.2.7'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.2.1'], ['i18n', '>= 0.4.2'], ['rest-client', '>=1.6.1'], ['haml', '>= 3.0.24']]
  end
 
  Rake::RDocTask.new(:rdoc) do |rdoc|
data/bin/scrappy CHANGED
@@ -1,7 +1,10 @@
  #!/usr/bin/ruby
  # encoding: UTF-8
 
- if !RUBY_PLATFORM.include?("mswin")
+ require 'rbconfig'
+ WINDOWS_PLATFORM = Config::CONFIG['host_os'] =~ /mswin|mingw/
+
+ if !WINDOWS_PLATFORM
  stty_save = `stty -g`.chomp
  trap('INT') { system('stty', stty_save); Scrappy::App.quit }
  end
@@ -31,8 +34,8 @@ module Scrappy
  OptionParser.new do |opts|
  opts.on('-v', '--version') { output_version; exit 0 }
  opts.on('-h', '--help') { output_help; exit 0 }
- opts.on('-g URL', '--get URL') { |url| Options.url = url; Options.http_method=:get }
- opts.on('-p URL', '--post URL') { |url| Options.url = url; Options.http_method=:post }
+ opts.on('-g URI', '--get URI') { |uri| Options.uri = uri; Options.http_method=:get }
+ opts.on('-p URI', '--post URI') { |uri| Options.uri = uri; Options.http_method=:post }
  opts.on('-D', '--dump') { Agent::Options.dump = true; Agent::Options.format = :rdf }
  opts.on('-u', '--debug') { Agent::Options.debug = true }
  opts.on('-i', '--interactive') { Options.shell = true; Agent::Options.format_header = false }
@@ -47,15 +50,19 @@ module Scrappy
  opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
  opts.on('-w', '--window') { Agent::Options.window = true }
  opts.on('-f FORMAT', '--format FORMAT') { |f| Agent::Options.format = f.to_sym }
+ opts.on('-t TIME', '--time TIME') { |t| Agent::Options.time = t.to_i*60 } # converts minutes to seconds
+ opts.on('-o URIs', '--observe URIs') { |uris| Options.observe = uris.split(',') }
  end.parse!(args)
  @file = args.shift
  end
 
  def run
  onload
- if Options.url
+ if Options.uri
  Options.quiet = true
- puts Agent.create.proxy(:http_method=>:get, :uri=>Options.url).output
+ puts Agent.create.proxy(:http_method=>:get, :uri=>Options.uri).output
+ elsif Options.observe
+ Agent.create.observe(Options.observe)
  elsif Options.proxy
  puts "Launching Scrappy Web Proxy (set http://localhost:#{Options.port} as proxy)..."
  require 'scrappy/server/proxy'
@@ -105,10 +112,12 @@ Options
  -D, --dump Dumps RDF data to disk
  -u, --debug Shows debugging traces
  -i, --interactive Runs interactive shell
+ -o, --observe URLs Observes the specified URLs, storing their data in the repository
  -s, --server [ROOT] Runs web server (optionally specify server's root url)
  -S, --proxy-server Runs web proxy
  -P, --port PORT Selects port number (default is 3434)
  -V, --visual Uses visual agent (slow)
+ -t, --time MINUTES Returns repository data from the last given minutes
  -r, --reference Outputs referenceable data
  -R, --reference-all Outputs all HTML referenceable data
  -w, --window Shows browser window (requires -v)
@@ -127,15 +136,23 @@ Copyright
 
  def onload
  # Check local or global knowledge base
- home = RUBY_PLATFORM.include?("mswin") ? "#{ENV['HOME']}/scrappy" : "#{ENV['HOME']}/.scrappy"
+ home = WINDOWS_PLATFORM ? "#{ENV['HOME']}/scrappy" : "#{ENV['HOME']}/.scrappy"
+
+ data_dirname = "kb"
+ cache_dirname = "cache"
+ cache_filename = "scrappy-#{Scrappy::VERSION}.kb"
+ config_filename = "config.yml"
 
- if File.exists?("#{home}/kb")
- data_folder = "#{home}/kb"
- cache_file = "#{home}/kb.cache"
+ if File.exists?(File.join(home, data_dirname))
+ data_folder = File.join home, data_dirname
+ cache_folder = File.join home, cache_dirname
  else
- data_folder = "#{Scrappy::Root}/kb"
- cache_file = "#{Dir.tmpdir}/scrappy.kb.cache"
+ data_folder = File.join Scrappy::Root, data_dirname
+ cache_folder = Dir.tmpdir
  end
+ Dir.mkdir cache_folder if Dir[cache_folder].empty?
+ cache_file = File.join cache_folder, cache_filename
+ config_file = File.join home, config_filename
 
  # Load knowledge base
  Agent::Options.kb = if File.exists?(cache_file) and File.mtime(cache_file) >= Dir["#{data_folder}/*", data_folder].map{ |f| File.mtime(f) }.max
@@ -143,7 +160,7 @@ Copyright
  open(cache_file) { |f| Marshal.load(f) }
  else
  # Load YARF files and cache kb
- data = Dir["#{data_folder}/*"].inject(RDF::Graph.new) do |kb, file|
+ data = Dir[File.join(data_folder, "*")].inject(RDF::Graph.new) do |kb, file|
  extension = file.split('.').last.to_sym
  graph = RDF::Parser.parse(extension, open(file).read)
  kb.ns.merge! graph.ns
@@ -153,6 +170,17 @@ Copyright
  open(cache_file, "w") { |f| Marshal.dump(data, f) }
  data
  end
+
+ # Looks for a configuration file in the .scrappy dir in the user's home.
+ # If it does not exist, Scrappy does not use Sesame
+ if File.exist?(config_file)
+ config = YAML::load_file(config_file)["repository"]
+ # Convert the string keys from the YAML file into symbols
+ repository_options = {}
+ config.each { |k,v| repository_options[k.to_sym] = v }
+ Agent::Options.repository = Repository.new repository_options
+ end
+
  RDF::ID.ns.merge! Agent::Options.kb.ns
  end
  end
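
For reference, the onload rewrite above resolves to this per-user layout when a local knowledge base exists (a sketch assuming a Unix home directory; on Windows the folder is ~/scrappy, and without a local kb the cache falls back to Dir.tmpdir):

  ~/.scrappy/kb/                      # local knowledge base (YARF files)
  ~/.scrappy/cache/scrappy-0.2.0.kb   # Marshal'd kb cache, now versioned per release
  ~/.scrappy/config.yml               # optional Sesame repository settings

Embedding Scrappy::VERSION in the cache filename means an upgrade starts from a fresh cache instead of loading a Marshal dump written by an older release.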
data/lib/scrappy.rb CHANGED
@@ -10,6 +10,7 @@ require 'tmpdir'
  require 'lightrdf'
 
  require 'scrappy/support'
+ require 'scrappy/repository'
 
  require 'scrappy/agent/extractor'
  require 'scrappy/agent/map_reduce'
@@ -21,7 +22,7 @@ require 'scrappy/agent/agent'
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
 
  module Scrappy
- VERSION = '0.1.24'
+ VERSION = '0.2.0'
  end
 
  # Require selectors
data/lib/scrappy/agent/agent.rb CHANGED
@@ -38,6 +38,7 @@ module Scrappy
  Agent.pool[@id] = self
  @kb = args[:kb] || Options.kb
  @options = Options.clone
+ @repository = args[:repository] || Options.repository
  end
 
  def map args, queue=nil
@@ -52,51 +53,35 @@ module Scrappy
  puts "Retrieving cached #{request[:uri]}...done!" if options.debug
 
  cache[request][:response]
+ elsif @repository
+ # Extracts from the repository
+ request_from_repository(request)
  else
  # Perform the request
-
- sleep 0.001 * options.delay.to_f # Sleep if requested
-
- if options.debug
- print "Opening #{request[:uri]}..."; $stdout.flush
- end
-
- if request[:method] == :get
- self.uri = request[:uri]
- else
- raise Exception, 'POST requests not supported yet'
- end
-
- puts 'done!' if options.debug
-
- response = if self.html_data?
- add_visual_data! if options.referenceable # Adds tags including visual information
- extraction = extract self.uri, html, options.referenceable # Extract data
- Dumper.dump self.uri, clean(extraction), options.format if options.dump # Dump results to disk
- extraction
- else
- []
- end
+ request_uncached(request)
+ end
 
+ # If a previous cache entry exists, do not cache it again
+ unless cache[request]
  # Cache the request
- cache[request] = { :time=>Time.now, :response=>response }
- cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>response } unless self.uri.nil?
-
- response
+ cache[request] = { :time=>Time.now, :response=>triples }
+ cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>triples } if self.uri
  end
 
  # Enqueue subresources
  # Pages are enqueued without reducing depth
- pages = triples.select { |s,p,o| p==Node("rdf:type") and o==Node("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(Symbol)}
+ pages = triples.select { |s,p,o| p==ID("rdf:type") and o==ID("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(Symbol)}
 
  # All other URIS are enqueued with depth reduced
  uris = if depth != 0
- (triples.map { |s, p, o| [s,o] }.flatten - [Node(self.uri)] - pages).select{|n| n.is_a?(RDF::Node) and n.id.is_a?(Symbol)}
+ (triples.map { |s, p, o| [s,o] }.flatten - [ID(self.uri)] - pages).select{|n| n.is_a?(Symbol)}
  else
  []
  end
 
- items = (pages.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth].max} } + uris.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth-1].max} }).uniq
+ items = ( pages.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth].max} } +
+ uris.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth-1].max} } ).
+ uniq.select{ |item| !RDF::ID.bnode?(item[:uri]) }
 
  items.each { |item| puts "Enqueuing (depth = #{item[:depth]}): #{item[:uri]}" } if options.debug
 
@@ -120,7 +105,7 @@ module Scrappy
 
  puts 'done!'if options.debug
 
- triples
+ triples.uniq
  end
 
  def request args={}
@@ -139,7 +124,7 @@ module Scrappy
  print "Serializing..."; $stdout.flush
  end
 
- output = response.serialize request[:format], @options.format_header
+ output = response.serialize request[:format], options.format_header
 
  puts 'done!'if options.debug
 
@@ -152,14 +137,106 @@ module Scrappy
  :status => self.html_data? ? (self.uri == request[:uri] ? :ok : :redirect) : :error
  end
 
+ # Observes several websites, extracting their data periodically
+ def observe uris
+ while true
+ time_init = Time.now.to_i
+ uris.each do |uri|
+ puts "Pinging #{uri}..."
+ request :uri=>uri
+ end
+ time = options.repository.time * 60 - (Time.now.to_i - time_init)
+ puts "Sleeping until #{Time.now + time}..."
+ sleep time
+ end
+ end
+
+ private
  def complete_uri uri
  uri = "#{uri}.com" if uri =~ /\A\w+\Z/
- uri = "http://#{uri}" if uri.index(/\A\w*:/) != 0
+ uri = "http://#{uri}" unless uri =~ /\A\w*:/
  uri
  end
-
+
  def clean triples
- triples.uniq.select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) }
+ triples.uniq.select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) }
+ end
+
+ # Does the extraction using the RDF repository
+ def request_from_repository request
+ triples = []
+
+ # Checks if there is any previous extraction within the configured time window
+ contexts = if Options.time
+ @repository.recent_contexts(request[:uri], Options.time)
+ else
+ @repository.recent_contexts(request[:uri])
+ end
+
+ if contexts.empty?
+ # Extracts data from the uri
+ triples = request_uncached request
+
+ if options.debug
+ print "Storing into repository #{request[:uri]}..."; $stdout.flush
+ end
+
+ # Checks if the extraction returned something
+ graph = if triples.empty?
+ # Creates a triple to indicate that nothing was extracted from the uri
+ # This is done because otherwise the context wouldn't be stored
+ RDF::Graph.new [ [ID(request[:uri]), ID("sc:extraction"), ID("sc:Empty")] ]
+ else
+ RDF::Graph.new triples.uniq
+ end
+
+ # Adds data to Sesame
+ @repository.data = graph, "#{request[:uri]}:#{Time.now.to_i}"
+ @repository.data = graph, "#{self.uri}:#{Time.now.to_i}" if self.uri
+
+ puts 'done!' if options.debug
+
+ triples
+ else
+ # Data found in the repository; retrieving it
+ triples = []
+ if options.debug
+ print "Retrieving from repository #{request[:uri]}..."; $stdout.flush
+ end
+ contexts.each do |context|
+ graph = @repository.data(context)
+ triples += graph.triples.select{|s,p,o| p!=ID("sc:extraction")}
+ end
+ puts 'done!' if options.debug
+
+ triples
+ end
+ end
+
+ # Extracts from the uri
+ def request_uncached request
+ sleep 0.001 * options.delay.to_f # Sleep if requested
+
+ if options.debug
+ print "Opening #{request[:uri]}..."; $stdout.flush
+ end
+
+ if request[:method] == :get
+ self.uri = request[:uri]
+ else
+ raise Exception, 'POST requests not supported yet'
+ end
+
+ puts 'done!' if options.debug
+
+ if self.html_data?
+ add_visual_data! if options.referenceable # Adds tags including visual information
+ triples = extract(self.uri, html, options.referenceable) # Extract data
+ Dumper.dump self.uri, clean(triples), options.format if options.dump # Dump results to disk
+ triples
+ else
+ []
+ end
  end
  end
  end
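
Condensed, the new dispatch order in Agent#map reads roughly as follows (a sketch using names from the diff, not runnable on its own):

  triples = if cache[request]          # 1. reuse a response cached during this run
    cache[request][:response]
  elsif @repository                    # 2. reuse or refresh a Sesame-stored extraction
    request_from_repository(request)
  else                                 # 3. fall back to a plain live request
    request_uncached(request)
  end

Since request_from_repository itself calls request_uncached when no fresh context exists, the repository behaves as a persistent cache layered under the per-run one.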
data/lib/scrappy/agent/extractor.rb CHANGED
@@ -6,7 +6,7 @@ module Scrappy
  if options.debug
  print "Extracting #{uri}..."; $stdout.flush
  end
-
+
  @selector_pool ||= {}
  triples = []
  content = Nokogiri::HTML(html, nil, 'utf-8')
@@ -27,7 +27,11 @@ module Scrappy
 
  puts "done!" if options.debug
 
- triples
+ triples.map do |s,p,o|
+ [ s.is_a?(RDF::Node) ? s.id : s,
+ p.is_a?(RDF::Node) ? p.id : p,
+ o.is_a?(RDF::Node) ? o.id : o ]
+ end
  end
 
  private
@@ -150,32 +154,32 @@ module Scrappy
  triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples) if referenceable==:dump or resources.include?(fragment)
 
  content.search('*').each do |node|
+ next if node.text?
+
  fragment = Node(node_hash(uri, node.path))
-
+
  if referenceable == :dump or resources[fragment]
- selector = Node(nil)
+ selector = Node(nil)
  presentation = Node(nil)
 
- selector.rdf::type = Node('sc:UnivocalSelector')
- selector.sc::path = node.path.to_s
- selector.sc::tag = node.name.to_s
- selector.sc::document = uri
-
- presentation.sc::x = node[:vx].to_s if node[:vx]
- presentation.sc::y = node[:vy].to_s if node[:vy]
- presentation.sc::width = node[:vw].to_s if node[:vw]
- presentation.sc::height = node[:vh].to_s if node[:vh]
- presentation.sc::font_size = node[:vsize].gsub("px","").to_s if node[:vsize]
- presentation.sc::font_weight = node[:vweight].to_s if node[:vweight]
- presentation.sc::color = node[:vcolor].to_s if node[:vcolor]
- presentation.sc::background_color = node[:vbcolor].to_s if node[:vbcolor]
- presentation.sc::text = node.text.strip
- presentation.sc::children_count = node.search('*').size.to_s
-
- fragment.sc::selector = selector
- fragment.sc::presentation = presentation unless presentation.empty?
-
- triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples)
+ triples << [selector, ID('rdf:type'), ID('sc:UnivocalSelector')]
+ triples << [selector, ID('sc:path'), node.path.to_s]
+ triples << [selector, ID('sc:tag'), node.name.to_s]
+ triples << [selector, ID('sc:document'), uri]
+
+ triples << [presentation, ID('sc:x'), node[:vx].to_s] if node[:vx]
+ triples << [presentation, ID('sc:y'), node[:vy].to_s] if node[:vy]
+ triples << [presentation, ID('sc:width'), node[:vw].to_s] if node[:vw]
+ triples << [presentation, ID('sc:height'), node[:vh].to_s] if node[:vh]
+ triples << [presentation, ID('sc:font_size'), node[:vsize].gsub("px","").to_s] if node[:vsize]
+ triples << [presentation, ID('sc:font_weight'), node[:vweight].to_s] if node[:vweight]
+ triples << [presentation, ID('sc:color'), node[:vcolor].to_s] if node[:vcolor]
+ triples << [presentation, ID('sc:background_color'), node[:vbcolor].to_s] if node[:vbcolor]
+ triples << [presentation, ID('sc:text'), node.text.strip]
+ triples << [presentation, ID('sc:children_count'), node.children.select{|n| !n.text?}.size.to_s]
+
+ triples << [fragment, ID('sc:selector'), selector]
+ triples << [fragment, ID('sc:presentation'), presentation]
  end
  end
  end
data/lib/scrappy/repository.rb ADDED
@@ -0,0 +1,34 @@
+ module Scrappy
+ class Repository < RDF::Repository
+ # Processes the list of contexts, checks if there is any extraction
+ # from the last X minutes, and returns an array with them.
+ # If there are no such extractions, returns an empty array
+ def recent_contexts uri, seconds=@options[:time].to_i*60
+ return [] unless uri
+ contexts.select do |context|
+ date = context_date(context)
+ date and check_date(date, seconds) and context_uri(context) == uri
+ end
+ end
+
+ def time
+ @options[:time]
+ end
+
+ protected
+ # Checks if the context date is within the indicated time
+ def check_date date, seconds
+ (Time.now.to_i - date) <= seconds
+ end
+
+ # Returns an integer with the date of a given context
+ def context_date context
+ $1.to_i if context =~ /:(\d+)\Z/
+ end
+
+ # Returns the URI of a context
+ def context_uri context
+ $1 if context =~ /\A(.*):(\d+)\Z/
+ end
+ end
+ end
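
The context identifiers parsed by context_uri and context_date are built in agent.rb as "#{uri}:#{Time.now.to_i}". A small round-trip sketch with an assumed context id:

  context = "http://example.org/:1299628800"
  context =~ /\A(.*):(\d+)\Z/
  uri  = $1                                  # => "http://example.org/"
  date = $2.to_i                             # => 1299628800
  fresh = (Time.now.to_i - date) <= 15 * 60  # check_date with the sample 15-minute window

Anchoring (\d+) at the end of the string keeps the greedy (.*) from swallowing the timestamp, even though the URI itself contains ':' characters.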
data/scrappy.gemspec CHANGED
@@ -2,17 +2,17 @@
 
  Gem::Specification.new do |s|
  s.name = %q{scrappy}
- s.version = "0.1.24"
+ s.version = "0.2.0"
 
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Jose Ignacio"]
- s.date = %q{2011-03-08}
+ s.date = %q{2011-03-09}
  s.default_executable = %q{scrappy}
  s.description = %q{RDF web scraper}
  s.email = %q{joseignacio.fernandez@gmail.com}
  s.executables = ["scrappy"]
- s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/proxy.rb", "lib/scrappy/server/server.rb", "lib/scrappy/server/public/images/logo.png", "lib/scrappy/server/public/images/logo_small.png", "lib/scrappy/server/public/stylesheets/application.css", "lib/scrappy/server/views/home.haml", "lib/scrappy/server/views/help.haml", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
- s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/proxy.rb", "lib/scrappy/server/server.rb", "lib/scrappy/server/public/images/logo.png", "lib/scrappy/server/public/images/logo_small.png", "lib/scrappy/server/public/stylesheets/application.css", "lib/scrappy/server/views/home.haml", "lib/scrappy/server/views/help.haml", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
+ s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/proxy.rb", "lib/scrappy/server/server.rb", "lib/scrappy/server/public/images/logo.png", "lib/scrappy/server/public/images/logo_small.png", "lib/scrappy/server/public/stylesheets/application.css", "lib/scrappy/server/views/home.haml", "lib/scrappy/server/views/help.haml", "lib/scrappy/repository.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
+ s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/proxy.rb", "lib/scrappy/server/server.rb", "lib/scrappy/server/public/images/logo.png", "lib/scrappy/server/public/images/logo_small.png", "lib/scrappy/server/public/stylesheets/application.css", "lib/scrappy/server/views/home.haml", "lib/scrappy/server/views/help.haml", "lib/scrappy/repository.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
  s.homepage = %q{http://github.com/josei/scrappy}
  s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
@@ -32,8 +32,9 @@ Gem::Specification.new do |s|
  s.add_runtime_dependency(%q<thin>, [">= 1.2.7"])
  s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
  s.add_runtime_dependency(%q<mechanize>, [">= 1.0.0"])
- s.add_runtime_dependency(%q<lightrdf>, [">= 0.2.0"])
+ s.add_runtime_dependency(%q<lightrdf>, [">= 0.2.1"])
  s.add_runtime_dependency(%q<i18n>, [">= 0.4.2"])
+ s.add_runtime_dependency(%q<rest-client>, [">= 1.6.1"])
  s.add_runtime_dependency(%q<haml>, [">= 3.0.24"])
  else
  s.add_dependency(%q<activesupport>, [">= 2.3.5"])
@@ -41,8 +42,9 @@ Gem::Specification.new do |s|
  s.add_dependency(%q<thin>, [">= 1.2.7"])
  s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
  s.add_dependency(%q<mechanize>, [">= 1.0.0"])
- s.add_dependency(%q<lightrdf>, [">= 0.2.0"])
+ s.add_dependency(%q<lightrdf>, [">= 0.2.1"])
  s.add_dependency(%q<i18n>, [">= 0.4.2"])
+ s.add_dependency(%q<rest-client>, [">= 1.6.1"])
  s.add_dependency(%q<haml>, [">= 3.0.24"])
  end
  else
@@ -51,8 +53,9 @@ Gem::Specification.new do |s|
  s.add_dependency(%q<thin>, [">= 1.2.7"])
  s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
  s.add_dependency(%q<mechanize>, [">= 1.0.0"])
- s.add_dependency(%q<lightrdf>, [">= 0.2.0"])
+ s.add_dependency(%q<lightrdf>, [">= 0.2.1"])
  s.add_dependency(%q<i18n>, [">= 0.4.2"])
+ s.add_dependency(%q<rest-client>, [">= 1.6.1"])
  s.add_dependency(%q<haml>, [">= 3.0.24"])
  end
  end
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
  prerelease: false
  segments:
  - 0
- - 1
- - 24
- version: 0.1.24
+ - 2
+ - 0
+ version: 0.2.0
  platform: ruby
  authors:
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
  bindir: bin
  cert_chain: []
 
- date: 2011-03-08 00:00:00 +01:00
+ date: 2011-03-09 00:00:00 +01:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -97,8 +97,8 @@ dependencies:
  segments:
  - 0
  - 2
- - 0
- version: 0.2.0
+ - 1
+ version: 0.2.1
  type: :runtime
  version_requirements: *id006
  - !ruby/object:Gem::Dependency
@@ -116,9 +116,23 @@ dependencies:
  type: :runtime
  version_requirements: *id007
  - !ruby/object:Gem::Dependency
- name: haml
+ name: rest-client
  prerelease: false
  requirement: &id008 !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ segments:
+ - 1
+ - 6
+ - 1
+ version: 1.6.1
+ type: :runtime
+ version_requirements: *id008
+ - !ruby/object:Gem::Dependency
+ name: haml
+ prerelease: false
+ requirement: &id009 !ruby/object:Gem::Requirement
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
@@ -128,7 +142,7 @@ dependencies:
  - 24
  version: 3.0.24
  type: :runtime
- version_requirements: *id008
+ version_requirements: *id009
  description: RDF web scraper
  email: joseignacio.fernandez@gmail.com
  executables:
@@ -164,6 +178,7 @@ extra_rdoc_files:
  - lib/scrappy/server/public/stylesheets/application.css
  - lib/scrappy/server/views/home.haml
  - lib/scrappy/server/views/help.haml
+ - lib/scrappy/repository.rb
  - lib/scrappy/shell.rb
  - lib/scrappy/support.rb
  - lib/scrappy/webkit/webkit.rb
@@ -200,6 +215,7 @@ files:
  - lib/scrappy/server/public/stylesheets/application.css
  - lib/scrappy/server/views/home.haml
  - lib/scrappy/server/views/help.haml
+ - lib/scrappy/repository.rb
  - lib/scrappy/shell.rb
  - lib/scrappy/support.rb
  - lib/scrappy/webkit/webkit.rb