scrappy 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. data/History.txt +6 -0
  2. data/Manifest +21 -14
  3. data/README.rdoc +5 -9
  4. data/Rakefile +1 -2
  5. data/bin/scrappy +141 -51
  6. data/lib/scrappy.rb +6 -9
  7. data/lib/scrappy/agent/agent.rb +3 -3
  8. data/lib/scrappy/extractor/extractor.rb +108 -0
  9. data/lib/scrappy/{agent → extractor}/formats.rb +0 -0
  10. data/lib/scrappy/extractor/fragment.rb +111 -0
  11. data/lib/scrappy/extractor/selector.rb +41 -0
  12. data/lib/scrappy/{selectors → extractor/selectors}/base_uri.rb +1 -3
  13. data/lib/scrappy/extractor/selectors/css.rb +5 -0
  14. data/lib/scrappy/{selectors → extractor/selectors}/new_uri.rb +1 -3
  15. data/lib/scrappy/{selectors → extractor/selectors}/root.rb +1 -4
  16. data/lib/scrappy/{selectors → extractor/selectors}/section.rb +1 -4
  17. data/lib/scrappy/{selectors → extractor/selectors}/slice.rb +1 -3
  18. data/lib/scrappy/{selectors → extractor/selectors}/uri.rb +2 -4
  19. data/lib/scrappy/{selectors → extractor/selectors}/uri_pattern.rb +2 -4
  20. data/lib/scrappy/extractor/selectors/visual.rb +39 -0
  21. data/lib/scrappy/{selectors → extractor/selectors}/xpath.rb +1 -4
  22. data/lib/scrappy/server/admin.rb +89 -2
  23. data/lib/scrappy/server/helpers.rb +11 -2
  24. data/lib/scrappy/server/server.rb +1 -0
  25. data/lib/scrappy/trainer/trainer.rb +101 -0
  26. data/public/javascripts/annotator.js +75 -0
  27. data/public/javascripts/remote.js +132 -0
  28. data/public/stylesheets/application.css +39 -12
  29. data/scrappy.gemspec +13 -11
  30. data/views/extractors.haml +24 -0
  31. data/views/layout.haml +14 -4
  32. data/views/patterns.haml +19 -0
  33. data/views/samples.haml +28 -0
  34. metadata +58 -56
  35. data/lib/scrappy/agent/extractor.rb +0 -196
  36. data/lib/scrappy/selectors/css.rb +0 -10
  37. data/public/javascripts/scrappy.js +0 -65
  38. data/views/kb.haml +0 -15
@@ -0,0 +1,28 @@
1
+ #body
2
+ %h1 Sample pages
3
+ %p
4
+ Sample pages are used to build extractors as well as visual patterns that can be applied to retrieve data
5
+ from other pages.
6
+ %p
7
+ -if @samples.empty?
8
+ Currently, there are no samples.
9
+ -else
10
+ %ul.detail
11
+ -@samples.each_with_index do |sample,i|
12
+ %li
13
+ %span.action
14
+ %a{:href=>"#{settings.base_uri}/samples/#{i}", :'data-method'=>:delete, :'data-confirm'=>"Are you sure you want to delete the sample #{sample[:uri]}?"}
15
+ X
16
+ %span.short_name
17
+ -if !sample[:uri].include?('*')
18
+ %a{:href=>sample[:uri]}=sample[:uri]
19
+ -else
20
+ =sample[:uri]
21
+ -[['Patterns output', :patterns], ['Extractors output', :extractors]].reverse.each do |text, action|
22
+ %span.format
23
+ %a{:href=>"#{settings.base_uri}/samples/#{i}/#{action}"}=text
24
+ %span.format
25
+ %a{:href=>"#{settings.base_uri}/samples/#{i}/train", :'data-method'=>:post} Train
26
+ %span.date
27
+ %a{:href=>"#{settings.base_uri}/samples/#{i}"}
28
+ =sample[:date].strftime("%Y/%m/%d - %H:%M")
metadata CHANGED
@@ -1,13 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrappy
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
5
4
  prerelease: false
6
5
  segments:
7
6
  - 0
8
7
  - 3
9
- - 0
10
- version: 0.3.0
8
+ - 1
9
+ version: 0.3.1
11
10
  platform: ruby
12
11
  authors:
13
12
  - Jose Ignacio
@@ -15,18 +14,16 @@ autorequire:
15
14
  bindir: bin
16
15
  cert_chain: []
17
16
 
18
- date: 2011-03-11 00:00:00 +01:00
17
+ date: 2011-03-18 00:00:00 +01:00
19
18
  default_executable:
20
19
  dependencies:
21
20
  - !ruby/object:Gem::Dependency
22
21
  name: activesupport
23
22
  prerelease: false
24
23
  requirement: &id001 !ruby/object:Gem::Requirement
25
- none: false
26
24
  requirements:
27
25
  - - ">="
28
26
  - !ruby/object:Gem::Version
29
- hash: 9
30
27
  segments:
31
28
  - 2
32
29
  - 3
@@ -38,11 +35,9 @@ dependencies:
38
35
  name: sinatra
39
36
  prerelease: false
40
37
  requirement: &id002 !ruby/object:Gem::Requirement
41
- none: false
42
38
  requirements:
43
39
  - - ">="
44
40
  - !ruby/object:Gem::Version
45
- hash: 23
46
41
  segments:
47
42
  - 1
48
43
  - 1
@@ -54,11 +49,9 @@ dependencies:
54
49
  name: thin
55
50
  prerelease: false
56
51
  requirement: &id003 !ruby/object:Gem::Requirement
57
- none: false
58
52
  requirements:
59
53
  - - ">="
60
54
  - !ruby/object:Gem::Version
61
- hash: 17
62
55
  segments:
63
56
  - 1
64
57
  - 2
@@ -70,11 +63,9 @@ dependencies:
70
63
  name: nokogiri
71
64
  prerelease: false
72
65
  requirement: &id004 !ruby/object:Gem::Requirement
73
- none: false
74
66
  requirements:
75
67
  - - ">="
76
68
  - !ruby/object:Gem::Version
77
- hash: 5
78
69
  segments:
79
70
  - 1
80
71
  - 4
@@ -86,11 +77,9 @@ dependencies:
86
77
  name: mechanize
87
78
  prerelease: false
88
79
  requirement: &id005 !ruby/object:Gem::Requirement
89
- none: false
90
80
  requirements:
91
81
  - - ">="
92
82
  - !ruby/object:Gem::Version
93
- hash: 23
94
83
  segments:
95
84
  - 1
96
85
  - 0
@@ -102,27 +91,23 @@ dependencies:
102
91
  name: lightrdf
103
92
  prerelease: false
104
93
  requirement: &id006 !ruby/object:Gem::Requirement
105
- none: false
106
94
  requirements:
107
95
  - - ">="
108
96
  - !ruby/object:Gem::Version
109
- hash: 21
110
97
  segments:
111
98
  - 0
112
- - 2
113
- - 1
114
- version: 0.2.1
99
+ - 3
100
+ - 0
101
+ version: 0.3.0
115
102
  type: :runtime
116
103
  version_requirements: *id006
117
104
  - !ruby/object:Gem::Dependency
118
105
  name: i18n
119
106
  prerelease: false
120
107
  requirement: &id007 !ruby/object:Gem::Requirement
121
- none: false
122
108
  requirements:
123
109
  - - ">="
124
110
  - !ruby/object:Gem::Version
125
- hash: 11
126
111
  segments:
127
112
  - 0
128
113
  - 4
@@ -134,11 +119,9 @@ dependencies:
134
119
  name: rest-client
135
120
  prerelease: false
136
121
  requirement: &id008 !ruby/object:Gem::Requirement
137
- none: false
138
122
  requirements:
139
123
  - - ">="
140
124
  - !ruby/object:Gem::Version
141
- hash: 13
142
125
  segments:
143
126
  - 1
144
127
  - 6
@@ -150,11 +133,9 @@ dependencies:
150
133
  name: haml
151
134
  prerelease: false
152
135
  requirement: &id009 !ruby/object:Gem::Requirement
153
- none: false
154
136
  requirements:
155
137
  - - ">="
156
138
  - !ruby/object:Gem::Version
157
- hash: 55
158
139
  segments:
159
140
  - 3
160
141
  - 0
@@ -162,6 +143,20 @@ dependencies:
162
143
  version: 3.0.24
163
144
  type: :runtime
164
145
  version_requirements: *id009
146
+ - !ruby/object:Gem::Dependency
147
+ name: rack-flash
148
+ prerelease: false
149
+ requirement: &id010 !ruby/object:Gem::Requirement
150
+ requirements:
151
+ - - ">="
152
+ - !ruby/object:Gem::Version
153
+ segments:
154
+ - 0
155
+ - 1
156
+ - 1
157
+ version: 0.1.1
158
+ type: :runtime
159
+ version_requirements: *id010
165
160
  description: RDF web scraper
166
161
  email: joseignacio.fernandez@gmail.com
167
162
  executables:
@@ -176,24 +171,28 @@ extra_rdoc_files:
176
171
  - lib/scrappy/agent/blind_agent.rb
177
172
  - lib/scrappy/agent/cache.rb
178
173
  - lib/scrappy/agent/dumper.rb
179
- - lib/scrappy/agent/extractor.rb
180
- - lib/scrappy/agent/formats.rb
181
174
  - lib/scrappy/agent/map_reduce.rb
175
+ - lib/scrappy/extractor/extractor.rb
176
+ - lib/scrappy/extractor/formats.rb
177
+ - lib/scrappy/extractor/fragment.rb
178
+ - lib/scrappy/extractor/selector.rb
179
+ - lib/scrappy/extractor/selectors/base_uri.rb
180
+ - lib/scrappy/extractor/selectors/css.rb
181
+ - lib/scrappy/extractor/selectors/new_uri.rb
182
+ - lib/scrappy/extractor/selectors/root.rb
183
+ - lib/scrappy/extractor/selectors/section.rb
184
+ - lib/scrappy/extractor/selectors/slice.rb
185
+ - lib/scrappy/extractor/selectors/uri.rb
186
+ - lib/scrappy/extractor/selectors/uri_pattern.rb
187
+ - lib/scrappy/extractor/selectors/visual.rb
188
+ - lib/scrappy/extractor/selectors/xpath.rb
182
189
  - lib/scrappy/repository.rb
183
- - lib/scrappy/selectors/base_uri.rb
184
- - lib/scrappy/selectors/css.rb
185
- - lib/scrappy/selectors/new_uri.rb
186
- - lib/scrappy/selectors/root.rb
187
- - lib/scrappy/selectors/section.rb
188
- - lib/scrappy/selectors/slice.rb
189
- - lib/scrappy/selectors/uri.rb
190
- - lib/scrappy/selectors/uri_pattern.rb
191
- - lib/scrappy/selectors/xpath.rb
192
190
  - lib/scrappy/server/admin.rb
193
191
  - lib/scrappy/server/errors.rb
194
192
  - lib/scrappy/server/helpers.rb
195
193
  - lib/scrappy/server/server.rb
196
194
  - lib/scrappy/support.rb
195
+ - lib/scrappy/trainer/trainer.rb
197
196
  files:
198
197
  - History.txt
199
198
  - Manifest
@@ -206,41 +205,48 @@ files:
206
205
  - lib/scrappy/agent/blind_agent.rb
207
206
  - lib/scrappy/agent/cache.rb
208
207
  - lib/scrappy/agent/dumper.rb
209
- - lib/scrappy/agent/extractor.rb
210
- - lib/scrappy/agent/formats.rb
211
208
  - lib/scrappy/agent/map_reduce.rb
209
+ - lib/scrappy/extractor/extractor.rb
210
+ - lib/scrappy/extractor/formats.rb
211
+ - lib/scrappy/extractor/fragment.rb
212
+ - lib/scrappy/extractor/selector.rb
213
+ - lib/scrappy/extractor/selectors/base_uri.rb
214
+ - lib/scrappy/extractor/selectors/css.rb
215
+ - lib/scrappy/extractor/selectors/new_uri.rb
216
+ - lib/scrappy/extractor/selectors/root.rb
217
+ - lib/scrappy/extractor/selectors/section.rb
218
+ - lib/scrappy/extractor/selectors/slice.rb
219
+ - lib/scrappy/extractor/selectors/uri.rb
220
+ - lib/scrappy/extractor/selectors/uri_pattern.rb
221
+ - lib/scrappy/extractor/selectors/visual.rb
222
+ - lib/scrappy/extractor/selectors/xpath.rb
212
223
  - lib/scrappy/repository.rb
213
- - lib/scrappy/selectors/base_uri.rb
214
- - lib/scrappy/selectors/css.rb
215
- - lib/scrappy/selectors/new_uri.rb
216
- - lib/scrappy/selectors/root.rb
217
- - lib/scrappy/selectors/section.rb
218
- - lib/scrappy/selectors/slice.rb
219
- - lib/scrappy/selectors/uri.rb
220
- - lib/scrappy/selectors/uri_pattern.rb
221
- - lib/scrappy/selectors/xpath.rb
222
224
  - lib/scrappy/server/admin.rb
223
225
  - lib/scrappy/server/errors.rb
224
226
  - lib/scrappy/server/helpers.rb
225
227
  - lib/scrappy/server/server.rb
226
228
  - lib/scrappy/support.rb
229
+ - lib/scrappy/trainer/trainer.rb
227
230
  - public/favicon.ico
228
231
  - public/images/logo.png
229
232
  - public/images/logo_tiny.png
230
- - public/javascripts/scrappy.js
233
+ - public/javascripts/annotator.js
234
+ - public/javascripts/remote.js
231
235
  - public/stylesheets/application.css
232
236
  - test/test_helper.rb
233
237
  - test/test_scrappy.rb
238
+ - views/extractors.haml
234
239
  - views/help.haml
235
240
  - views/home.haml
236
- - views/kb.haml
237
241
  - views/layout.haml
242
+ - views/patterns.haml
243
+ - views/samples.haml
238
244
  - scrappy.gemspec
239
245
  has_rdoc: true
240
246
  homepage: http://github.com/josei/scrappy
241
247
  licenses: []
242
248
 
243
- post_install_message: "**(Optional) Remember to install rbwebkitgtk for visual parsing features**"
249
+ post_install_message:
244
250
  rdoc_options:
245
251
  - --line-numbers
246
252
  - --inline-source
@@ -251,20 +257,16 @@ rdoc_options:
251
257
  require_paths:
252
258
  - lib
253
259
  required_ruby_version: !ruby/object:Gem::Requirement
254
- none: false
255
260
  requirements:
256
261
  - - ">="
257
262
  - !ruby/object:Gem::Version
258
- hash: 3
259
263
  segments:
260
264
  - 0
261
265
  version: "0"
262
266
  required_rubygems_version: !ruby/object:Gem::Requirement
263
- none: false
264
267
  requirements:
265
268
  - - ">="
266
269
  - !ruby/object:Gem::Version
267
- hash: 11
268
270
  segments:
269
271
  - 1
270
272
  - 2
@@ -272,10 +274,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
272
274
  requirements: []
273
275
 
274
276
  rubyforge_project: scrappy
275
- rubygems_version: 1.3.7
277
+ rubygems_version: 1.3.6
276
278
  signing_key:
277
279
  specification_version: 3
278
280
  summary: Web scraper that allows producing RDF data out of plain web pages
279
281
  test_files:
280
- - test/test_helper.rb
281
282
  - test/test_scrappy.rb
283
+ - test/test_helper.rb
@@ -1,196 +0,0 @@
1
- require 'digest/md5'
2
-
3
- module Scrappy
4
- module Extractor
5
- def extract uri, html, referenceable=nil
6
- if options.debug
7
- print "Extracting #{uri}..."; $stdout.flush
8
- end
9
-
10
- @selector_pool ||= {}
11
- triples = []
12
- content = Nokogiri::HTML(html, nil, 'utf-8')
13
-
14
- uri_selectors = (kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')) + kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector'))).flatten.select do |uri_selector|
15
- results = selector_pool(uri_selector).filter :content=>content, :uri=>uri
16
- !results.empty?
17
- end
18
-
19
- fragments = uri_selectors.map { |uri_selector| kb.find(nil, Node('sc:selector'), uri_selector) }.flatten
20
-
21
- fragments.each do |fragment|
22
- extract_fragment fragment, :doc=>{:uri=>uri, :content=>content },
23
- :parent=>uri, :triples=>triples, :referenceable=>!referenceable.nil?
24
- end
25
-
26
- add_referenceable_data content, triples, referenceable if referenceable
27
-
28
- puts "done!" if options.debug
29
-
30
- triples.map do |s,p,o|
31
- [ s.is_a?(RDF::Node) ? s.id : s,
32
- p.is_a?(RDF::Node) ? p.id : p,
33
- o.is_a?(RDF::Node) ? o.id : o ]
34
- end
35
- end
36
-
37
- private
38
- def extract_fragment fragment, options={}
39
- node = Node(options[:parent])
40
- uri = options[:doc][:uri]
41
-
42
- # Select nodes
43
- docs = fragment.sc::selector.map { |s| filter s, options[:doc] }.flatten
44
-
45
- # Generate triples
46
- docs.each do |doc|
47
- # Build URIs if identifier present
48
- nodes = fragment.sc::identifier.map { |s| filter s, doc }.flatten.map do |d|
49
- node = Node(parse_uri(uri, d[:value]))
50
- if options[:referenceable]
51
- # Include the fragment where the URI was built from
52
- uri_node = Node(nil)
53
- options[:triples] << [ node, Node("sc:uri"), uri_node ]
54
- options[:triples] << [ uri_node, Node("rdf:value"), node.to_s ]
55
- options[:triples] << [ uri_node, Node("sc:source"), Node(node_hash(d[:uri], d[:content].path)) ]
56
- end
57
- node
58
- end
59
- nodes << Node(nil) if nodes.empty?
60
-
61
- nodes.each do |node|
62
- # Build the object
63
- object = if fragment.sc::type.include?(Node('rdf:Literal'))
64
- value = doc[:value].to_s.strip
65
- if options[:referenceable]
66
- bnode = Node(nil)
67
- bnode.rdf::value = value
68
- bnode.rdf::type = Node('rdf:Literal')
69
- options[:triples].push *bnode.triples
70
- bnode
71
- else
72
- value
73
- end
74
- else
75
- fragment.sc::type.each { |type| options[:triples] << [node, Node('rdf:type'), type] if type != Node('rdf:Resource') }
76
- fragment.sc::superclass.each { |superclass| options[:triples] << [node, Node('rdfs:subClassOf'), superclass] }
77
- fragment.sc::sameas.each { |samenode| options[:triples] << [node, Node('owl:sameAs'), samenode] }
78
- node
79
- end
80
- fragment.sc::relation.each { |relation| options[:triples] << [options[:parent], relation, object] }
81
-
82
- # Add referenceable data if requested
83
- if options[:referenceable]
84
- sources = [doc[:content]].flatten.map { |node| Node(node_hash(doc[:uri], node.path)) }
85
- sources.each do |source|
86
- options[:triples] << [ object, Node("sc:source"), source ]
87
- fragment.sc::type.each { |t| options[:triples] << [ source, Node("sc:type"), t ] }
88
- fragment.sc::relation.each { |relation| options[:triples] << [ source, Node("sc:relation"), relation ] }
89
- end
90
- end
91
-
92
- # Process subfragments
93
- fragment.sc::subfragment.each { |subfragment| extract_fragment subfragment, options.merge(:doc=>doc, :parent=>object) }
94
- end
95
- end
96
- end
97
-
98
- def filter selector, doc
99
- if selector.sc::debug.first=="true" and options.debug
100
- puts '== DEBUG'
101
- puts '== Selector:'
102
- puts selector.serialize(:yarf, false)
103
- puts '== On fragment:'
104
- puts "URI: #{doc[:uri]}"
105
- puts "Content: #{doc[:content]}"
106
- puts "Value: #{doc[:value]}"
107
- end
108
-
109
- # Process selector
110
- results = selector_pool(selector).filter doc
111
-
112
- if selector.sc::debug.first=="true" and options.debug
113
- puts "== No results" if results.empty?
114
- results.each_with_index do |result, i|
115
- puts "== Result ##{i}:"
116
- puts "URI: #{result[:uri]}"
117
- puts "Content: #{result[:content]}"
118
- puts "Value: #{result[:value].inspect}"
119
- end
120
- puts
121
- end
122
-
123
- # Return results if no nested selectors
124
- return results if selector.sc::selector.empty?
125
-
126
- # Process nested selectors
127
- results.map do |result|
128
- selector.sc::selector.map { |s| filter s, result }
129
- end.flatten
130
- end
131
-
132
- def parse_uri(uri, rel_uri)
133
- return ID('*') if rel_uri.nil?
134
- begin
135
- ID(URI::parse(uri.split('/')[0..3]*'/').merge(rel_uri).to_s)
136
- rescue
137
- ID('*')
138
- end
139
- end
140
-
141
- def add_referenceable_data content, triples, referenceable
142
- resources = {}; triples.each { |s,p,o| resources[o] = true }
143
-
144
- fragment = Node(node_hash(uri, '/'))
145
- selector = Node(nil)
146
- presentation = Node(nil)
147
-
148
- selector.rdf::type = Node('sc:UnivocalSelector')
149
- selector.sc::path = '/'
150
- selector.sc::document = uri
151
-
152
- fragment.sc::selector = selector
153
-
154
- triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples) if referenceable==:dump or resources.include?(fragment)
155
-
156
- content.search('*').each do |node|
157
- next if node.text?
158
-
159
- fragment = Node(node_hash(uri, node.path))
160
-
161
- if referenceable == :dump or resources[fragment]
162
- selector = Node(nil)
163
- presentation = Node(nil)
164
-
165
- triples << [selector, ID('rdf:type'), ID('sc:UnivocalSelector')]
166
- triples << [selector, ID('sc:path'), node.path.to_s]
167
- triples << [selector, ID('sc:tag'), node.name.to_s]
168
- triples << [selector, ID('sc:document'), uri]
169
-
170
- triples << [presentation, ID('sc:x'), node[:vx].to_s] if node[:vx]
171
- triples << [presentation, ID('sc:y'), node[:vy].to_s] if node[:vy]
172
- triples << [presentation, ID('sc:width'), node[:vw].to_s] if node[:vw]
173
- triples << [presentation, ID('sc:height'), node[:vh].to_s] if node[:vh]
174
- triples << [presentation, ID('sc:font_size'), node[:vsize].gsub("px","").to_s] if node[:vsize]
175
- triples << [presentation, ID('sc:font_weight'), node[:vweight].to_s] if node[:vweight]
176
- triples << [presentation, ID('sc:color'), node[:vcolor].to_s] if node[:vcolor]
177
- triples << [presentation, ID('sc:background_color'), node[:vbcolor].to_s] if node[:vbcolor]
178
- triples << [presentation, ID('sc:text'), node.text.strip]
179
- triples << [presentation, ID('sc:children_count'), node.children.select{|n| !n.text?}.size.to_s]
180
-
181
- triples << [fragment, ID('sc:selector'), selector]
182
- triples << [fragment, ID('sc:presentation'), presentation]
183
- end
184
- end
185
- end
186
-
187
- def node_hash uri, path
188
- digest = Digest::MD5.hexdigest("#{uri} #{path}")
189
- :"_:bnode#{digest}"
190
- end
191
-
192
- def selector_pool selector
193
- @selector_pool[selector.id] ||= kb.node(selector)
194
- end
195
- end
196
- end