scrappy 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. data/History.txt +6 -0
  2. data/Manifest +21 -14
  3. data/README.rdoc +5 -9
  4. data/Rakefile +1 -2
  5. data/bin/scrappy +141 -51
  6. data/lib/scrappy.rb +6 -9
  7. data/lib/scrappy/agent/agent.rb +3 -3
  8. data/lib/scrappy/extractor/extractor.rb +108 -0
  9. data/lib/scrappy/{agent → extractor}/formats.rb +0 -0
  10. data/lib/scrappy/extractor/fragment.rb +111 -0
  11. data/lib/scrappy/extractor/selector.rb +41 -0
  12. data/lib/scrappy/{selectors → extractor/selectors}/base_uri.rb +1 -3
  13. data/lib/scrappy/extractor/selectors/css.rb +5 -0
  14. data/lib/scrappy/{selectors → extractor/selectors}/new_uri.rb +1 -3
  15. data/lib/scrappy/{selectors → extractor/selectors}/root.rb +1 -4
  16. data/lib/scrappy/{selectors → extractor/selectors}/section.rb +1 -4
  17. data/lib/scrappy/{selectors → extractor/selectors}/slice.rb +1 -3
  18. data/lib/scrappy/{selectors → extractor/selectors}/uri.rb +2 -4
  19. data/lib/scrappy/{selectors → extractor/selectors}/uri_pattern.rb +2 -4
  20. data/lib/scrappy/extractor/selectors/visual.rb +39 -0
  21. data/lib/scrappy/{selectors → extractor/selectors}/xpath.rb +1 -4
  22. data/lib/scrappy/server/admin.rb +89 -2
  23. data/lib/scrappy/server/helpers.rb +11 -2
  24. data/lib/scrappy/server/server.rb +1 -0
  25. data/lib/scrappy/trainer/trainer.rb +101 -0
  26. data/public/javascripts/annotator.js +75 -0
  27. data/public/javascripts/remote.js +132 -0
  28. data/public/stylesheets/application.css +39 -12
  29. data/scrappy.gemspec +13 -11
  30. data/views/extractors.haml +24 -0
  31. data/views/layout.haml +14 -4
  32. data/views/patterns.haml +19 -0
  33. data/views/samples.haml +28 -0
  34. metadata +58 -56
  35. data/lib/scrappy/agent/extractor.rb +0 -196
  36. data/lib/scrappy/selectors/css.rb +0 -10
  37. data/public/javascripts/scrappy.js +0 -65
  38. data/views/kb.haml +0 -15
@@ -0,0 +1,28 @@
1
+ #body
2
+ %h1 Sample pages
3
+ %p
4
+ Sample pages are used to build extractors as well as visual patterns that can be applied to retrieve data
5
+ from other pages.
6
+ %p
7
+ -if @samples.empty?
8
+ Currently, there are no samples.
9
+ -else
10
+ %ul.detail
11
+ -@samples.each_with_index do |sample,i|
12
+ %li
13
+ %span.action
14
+ %a{:href=>"#{settings.base_uri}/samples/#{i}", :'data-method'=>:delete, :'data-confirm'=>"Are you sure you want to delete the sample #{sample[:uri]}?"}
15
+ X
16
+ %span.short_name
17
+ -if !sample[:uri].include?('*')
18
+ %a{:href=>sample[:uri]}=sample[:uri]
19
+ -else
20
+ =sample[:uri]
21
+ -[['Patterns output', :patterns], ['Extractors output', :extractors]].reverse.each do |text, action|
22
+ %span.format
23
+ %a{:href=>"#{settings.base_uri}/samples/#{i}/#{action}"}=text
24
+ %span.format
25
+ %a{:href=>"#{settings.base_uri}/samples/#{i}/train", :'data-method'=>:post} Train
26
+ %span.date
27
+ %a{:href=>"#{settings.base_uri}/samples/#{i}"}
28
+ =sample[:date].strftime("%Y/%m/%d - %H:%M")
metadata CHANGED
@@ -1,13 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrappy
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
5
4
  prerelease: false
6
5
  segments:
7
6
  - 0
8
7
  - 3
9
- - 0
10
- version: 0.3.0
8
+ - 1
9
+ version: 0.3.1
11
10
  platform: ruby
12
11
  authors:
13
12
  - Jose Ignacio
@@ -15,18 +14,16 @@ autorequire:
15
14
  bindir: bin
16
15
  cert_chain: []
17
16
 
18
- date: 2011-03-11 00:00:00 +01:00
17
+ date: 2011-03-18 00:00:00 +01:00
19
18
  default_executable:
20
19
  dependencies:
21
20
  - !ruby/object:Gem::Dependency
22
21
  name: activesupport
23
22
  prerelease: false
24
23
  requirement: &id001 !ruby/object:Gem::Requirement
25
- none: false
26
24
  requirements:
27
25
  - - ">="
28
26
  - !ruby/object:Gem::Version
29
- hash: 9
30
27
  segments:
31
28
  - 2
32
29
  - 3
@@ -38,11 +35,9 @@ dependencies:
38
35
  name: sinatra
39
36
  prerelease: false
40
37
  requirement: &id002 !ruby/object:Gem::Requirement
41
- none: false
42
38
  requirements:
43
39
  - - ">="
44
40
  - !ruby/object:Gem::Version
45
- hash: 23
46
41
  segments:
47
42
  - 1
48
43
  - 1
@@ -54,11 +49,9 @@ dependencies:
54
49
  name: thin
55
50
  prerelease: false
56
51
  requirement: &id003 !ruby/object:Gem::Requirement
57
- none: false
58
52
  requirements:
59
53
  - - ">="
60
54
  - !ruby/object:Gem::Version
61
- hash: 17
62
55
  segments:
63
56
  - 1
64
57
  - 2
@@ -70,11 +63,9 @@ dependencies:
70
63
  name: nokogiri
71
64
  prerelease: false
72
65
  requirement: &id004 !ruby/object:Gem::Requirement
73
- none: false
74
66
  requirements:
75
67
  - - ">="
76
68
  - !ruby/object:Gem::Version
77
- hash: 5
78
69
  segments:
79
70
  - 1
80
71
  - 4
@@ -86,11 +77,9 @@ dependencies:
86
77
  name: mechanize
87
78
  prerelease: false
88
79
  requirement: &id005 !ruby/object:Gem::Requirement
89
- none: false
90
80
  requirements:
91
81
  - - ">="
92
82
  - !ruby/object:Gem::Version
93
- hash: 23
94
83
  segments:
95
84
  - 1
96
85
  - 0
@@ -102,27 +91,23 @@ dependencies:
102
91
  name: lightrdf
103
92
  prerelease: false
104
93
  requirement: &id006 !ruby/object:Gem::Requirement
105
- none: false
106
94
  requirements:
107
95
  - - ">="
108
96
  - !ruby/object:Gem::Version
109
- hash: 21
110
97
  segments:
111
98
  - 0
112
- - 2
113
- - 1
114
- version: 0.2.1
99
+ - 3
100
+ - 0
101
+ version: 0.3.0
115
102
  type: :runtime
116
103
  version_requirements: *id006
117
104
  - !ruby/object:Gem::Dependency
118
105
  name: i18n
119
106
  prerelease: false
120
107
  requirement: &id007 !ruby/object:Gem::Requirement
121
- none: false
122
108
  requirements:
123
109
  - - ">="
124
110
  - !ruby/object:Gem::Version
125
- hash: 11
126
111
  segments:
127
112
  - 0
128
113
  - 4
@@ -134,11 +119,9 @@ dependencies:
134
119
  name: rest-client
135
120
  prerelease: false
136
121
  requirement: &id008 !ruby/object:Gem::Requirement
137
- none: false
138
122
  requirements:
139
123
  - - ">="
140
124
  - !ruby/object:Gem::Version
141
- hash: 13
142
125
  segments:
143
126
  - 1
144
127
  - 6
@@ -150,11 +133,9 @@ dependencies:
150
133
  name: haml
151
134
  prerelease: false
152
135
  requirement: &id009 !ruby/object:Gem::Requirement
153
- none: false
154
136
  requirements:
155
137
  - - ">="
156
138
  - !ruby/object:Gem::Version
157
- hash: 55
158
139
  segments:
159
140
  - 3
160
141
  - 0
@@ -162,6 +143,20 @@ dependencies:
162
143
  version: 3.0.24
163
144
  type: :runtime
164
145
  version_requirements: *id009
146
+ - !ruby/object:Gem::Dependency
147
+ name: rack-flash
148
+ prerelease: false
149
+ requirement: &id010 !ruby/object:Gem::Requirement
150
+ requirements:
151
+ - - ">="
152
+ - !ruby/object:Gem::Version
153
+ segments:
154
+ - 0
155
+ - 1
156
+ - 1
157
+ version: 0.1.1
158
+ type: :runtime
159
+ version_requirements: *id010
165
160
  description: RDF web scraper
166
161
  email: joseignacio.fernandez@gmail.com
167
162
  executables:
@@ -176,24 +171,28 @@ extra_rdoc_files:
176
171
  - lib/scrappy/agent/blind_agent.rb
177
172
  - lib/scrappy/agent/cache.rb
178
173
  - lib/scrappy/agent/dumper.rb
179
- - lib/scrappy/agent/extractor.rb
180
- - lib/scrappy/agent/formats.rb
181
174
  - lib/scrappy/agent/map_reduce.rb
175
+ - lib/scrappy/extractor/extractor.rb
176
+ - lib/scrappy/extractor/formats.rb
177
+ - lib/scrappy/extractor/fragment.rb
178
+ - lib/scrappy/extractor/selector.rb
179
+ - lib/scrappy/extractor/selectors/base_uri.rb
180
+ - lib/scrappy/extractor/selectors/css.rb
181
+ - lib/scrappy/extractor/selectors/new_uri.rb
182
+ - lib/scrappy/extractor/selectors/root.rb
183
+ - lib/scrappy/extractor/selectors/section.rb
184
+ - lib/scrappy/extractor/selectors/slice.rb
185
+ - lib/scrappy/extractor/selectors/uri.rb
186
+ - lib/scrappy/extractor/selectors/uri_pattern.rb
187
+ - lib/scrappy/extractor/selectors/visual.rb
188
+ - lib/scrappy/extractor/selectors/xpath.rb
182
189
  - lib/scrappy/repository.rb
183
- - lib/scrappy/selectors/base_uri.rb
184
- - lib/scrappy/selectors/css.rb
185
- - lib/scrappy/selectors/new_uri.rb
186
- - lib/scrappy/selectors/root.rb
187
- - lib/scrappy/selectors/section.rb
188
- - lib/scrappy/selectors/slice.rb
189
- - lib/scrappy/selectors/uri.rb
190
- - lib/scrappy/selectors/uri_pattern.rb
191
- - lib/scrappy/selectors/xpath.rb
192
190
  - lib/scrappy/server/admin.rb
193
191
  - lib/scrappy/server/errors.rb
194
192
  - lib/scrappy/server/helpers.rb
195
193
  - lib/scrappy/server/server.rb
196
194
  - lib/scrappy/support.rb
195
+ - lib/scrappy/trainer/trainer.rb
197
196
  files:
198
197
  - History.txt
199
198
  - Manifest
@@ -206,41 +205,48 @@ files:
206
205
  - lib/scrappy/agent/blind_agent.rb
207
206
  - lib/scrappy/agent/cache.rb
208
207
  - lib/scrappy/agent/dumper.rb
209
- - lib/scrappy/agent/extractor.rb
210
- - lib/scrappy/agent/formats.rb
211
208
  - lib/scrappy/agent/map_reduce.rb
209
+ - lib/scrappy/extractor/extractor.rb
210
+ - lib/scrappy/extractor/formats.rb
211
+ - lib/scrappy/extractor/fragment.rb
212
+ - lib/scrappy/extractor/selector.rb
213
+ - lib/scrappy/extractor/selectors/base_uri.rb
214
+ - lib/scrappy/extractor/selectors/css.rb
215
+ - lib/scrappy/extractor/selectors/new_uri.rb
216
+ - lib/scrappy/extractor/selectors/root.rb
217
+ - lib/scrappy/extractor/selectors/section.rb
218
+ - lib/scrappy/extractor/selectors/slice.rb
219
+ - lib/scrappy/extractor/selectors/uri.rb
220
+ - lib/scrappy/extractor/selectors/uri_pattern.rb
221
+ - lib/scrappy/extractor/selectors/visual.rb
222
+ - lib/scrappy/extractor/selectors/xpath.rb
212
223
  - lib/scrappy/repository.rb
213
- - lib/scrappy/selectors/base_uri.rb
214
- - lib/scrappy/selectors/css.rb
215
- - lib/scrappy/selectors/new_uri.rb
216
- - lib/scrappy/selectors/root.rb
217
- - lib/scrappy/selectors/section.rb
218
- - lib/scrappy/selectors/slice.rb
219
- - lib/scrappy/selectors/uri.rb
220
- - lib/scrappy/selectors/uri_pattern.rb
221
- - lib/scrappy/selectors/xpath.rb
222
224
  - lib/scrappy/server/admin.rb
223
225
  - lib/scrappy/server/errors.rb
224
226
  - lib/scrappy/server/helpers.rb
225
227
  - lib/scrappy/server/server.rb
226
228
  - lib/scrappy/support.rb
229
+ - lib/scrappy/trainer/trainer.rb
227
230
  - public/favicon.ico
228
231
  - public/images/logo.png
229
232
  - public/images/logo_tiny.png
230
- - public/javascripts/scrappy.js
233
+ - public/javascripts/annotator.js
234
+ - public/javascripts/remote.js
231
235
  - public/stylesheets/application.css
232
236
  - test/test_helper.rb
233
237
  - test/test_scrappy.rb
238
+ - views/extractors.haml
234
239
  - views/help.haml
235
240
  - views/home.haml
236
- - views/kb.haml
237
241
  - views/layout.haml
242
+ - views/patterns.haml
243
+ - views/samples.haml
238
244
  - scrappy.gemspec
239
245
  has_rdoc: true
240
246
  homepage: http://github.com/josei/scrappy
241
247
  licenses: []
242
248
 
243
- post_install_message: "**(Optional) Remember to install rbwebkitgtk for visual parsing features**"
249
+ post_install_message:
244
250
  rdoc_options:
245
251
  - --line-numbers
246
252
  - --inline-source
@@ -251,20 +257,16 @@ rdoc_options:
251
257
  require_paths:
252
258
  - lib
253
259
  required_ruby_version: !ruby/object:Gem::Requirement
254
- none: false
255
260
  requirements:
256
261
  - - ">="
257
262
  - !ruby/object:Gem::Version
258
- hash: 3
259
263
  segments:
260
264
  - 0
261
265
  version: "0"
262
266
  required_rubygems_version: !ruby/object:Gem::Requirement
263
- none: false
264
267
  requirements:
265
268
  - - ">="
266
269
  - !ruby/object:Gem::Version
267
- hash: 11
268
270
  segments:
269
271
  - 1
270
272
  - 2
@@ -272,10 +274,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
272
274
  requirements: []
273
275
 
274
276
  rubyforge_project: scrappy
275
- rubygems_version: 1.3.7
277
+ rubygems_version: 1.3.6
276
278
  signing_key:
277
279
  specification_version: 3
278
280
  summary: Web scraper that allows producing RDF data out of plain web pages
279
281
  test_files:
280
- - test/test_helper.rb
281
282
  - test/test_scrappy.rb
283
+ - test/test_helper.rb
@@ -1,196 +0,0 @@
1
- require 'digest/md5'
2
-
3
- module Scrappy
4
- module Extractor
5
- def extract uri, html, referenceable=nil
6
- if options.debug
7
- print "Extracting #{uri}..."; $stdout.flush
8
- end
9
-
10
- @selector_pool ||= {}
11
- triples = []
12
- content = Nokogiri::HTML(html, nil, 'utf-8')
13
-
14
- uri_selectors = (kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')) + kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector'))).flatten.select do |uri_selector|
15
- results = selector_pool(uri_selector).filter :content=>content, :uri=>uri
16
- !results.empty?
17
- end
18
-
19
- fragments = uri_selectors.map { |uri_selector| kb.find(nil, Node('sc:selector'), uri_selector) }.flatten
20
-
21
- fragments.each do |fragment|
22
- extract_fragment fragment, :doc=>{:uri=>uri, :content=>content },
23
- :parent=>uri, :triples=>triples, :referenceable=>!referenceable.nil?
24
- end
25
-
26
- add_referenceable_data content, triples, referenceable if referenceable
27
-
28
- puts "done!" if options.debug
29
-
30
- triples.map do |s,p,o|
31
- [ s.is_a?(RDF::Node) ? s.id : s,
32
- p.is_a?(RDF::Node) ? p.id : p,
33
- o.is_a?(RDF::Node) ? o.id : o ]
34
- end
35
- end
36
-
37
- private
38
- def extract_fragment fragment, options={}
39
- node = Node(options[:parent])
40
- uri = options[:doc][:uri]
41
-
42
- # Select nodes
43
- docs = fragment.sc::selector.map { |s| filter s, options[:doc] }.flatten
44
-
45
- # Generate triples
46
- docs.each do |doc|
47
- # Build URIs if identifier present
48
- nodes = fragment.sc::identifier.map { |s| filter s, doc }.flatten.map do |d|
49
- node = Node(parse_uri(uri, d[:value]))
50
- if options[:referenceable]
51
- # Include the fragment where the URI was built from
52
- uri_node = Node(nil)
53
- options[:triples] << [ node, Node("sc:uri"), uri_node ]
54
- options[:triples] << [ uri_node, Node("rdf:value"), node.to_s ]
55
- options[:triples] << [ uri_node, Node("sc:source"), Node(node_hash(d[:uri], d[:content].path)) ]
56
- end
57
- node
58
- end
59
- nodes << Node(nil) if nodes.empty?
60
-
61
- nodes.each do |node|
62
- # Build the object
63
- object = if fragment.sc::type.include?(Node('rdf:Literal'))
64
- value = doc[:value].to_s.strip
65
- if options[:referenceable]
66
- bnode = Node(nil)
67
- bnode.rdf::value = value
68
- bnode.rdf::type = Node('rdf:Literal')
69
- options[:triples].push *bnode.triples
70
- bnode
71
- else
72
- value
73
- end
74
- else
75
- fragment.sc::type.each { |type| options[:triples] << [node, Node('rdf:type'), type] if type != Node('rdf:Resource') }
76
- fragment.sc::superclass.each { |superclass| options[:triples] << [node, Node('rdfs:subClassOf'), superclass] }
77
- fragment.sc::sameas.each { |samenode| options[:triples] << [node, Node('owl:sameAs'), samenode] }
78
- node
79
- end
80
- fragment.sc::relation.each { |relation| options[:triples] << [options[:parent], relation, object] }
81
-
82
- # Add referenceable data if requested
83
- if options[:referenceable]
84
- sources = [doc[:content]].flatten.map { |node| Node(node_hash(doc[:uri], node.path)) }
85
- sources.each do |source|
86
- options[:triples] << [ object, Node("sc:source"), source ]
87
- fragment.sc::type.each { |t| options[:triples] << [ source, Node("sc:type"), t ] }
88
- fragment.sc::relation.each { |relation| options[:triples] << [ source, Node("sc:relation"), relation ] }
89
- end
90
- end
91
-
92
- # Process subfragments
93
- fragment.sc::subfragment.each { |subfragment| extract_fragment subfragment, options.merge(:doc=>doc, :parent=>object) }
94
- end
95
- end
96
- end
97
-
98
- def filter selector, doc
99
- if selector.sc::debug.first=="true" and options.debug
100
- puts '== DEBUG'
101
- puts '== Selector:'
102
- puts selector.serialize(:yarf, false)
103
- puts '== On fragment:'
104
- puts "URI: #{doc[:uri]}"
105
- puts "Content: #{doc[:content]}"
106
- puts "Value: #{doc[:value]}"
107
- end
108
-
109
- # Process selector
110
- results = selector_pool(selector).filter doc
111
-
112
- if selector.sc::debug.first=="true" and options.debug
113
- puts "== No results" if results.empty?
114
- results.each_with_index do |result, i|
115
- puts "== Result ##{i}:"
116
- puts "URI: #{result[:uri]}"
117
- puts "Content: #{result[:content]}"
118
- puts "Value: #{result[:value].inspect}"
119
- end
120
- puts
121
- end
122
-
123
- # Return results if no nested selectors
124
- return results if selector.sc::selector.empty?
125
-
126
- # Process nested selectors
127
- results.map do |result|
128
- selector.sc::selector.map { |s| filter s, result }
129
- end.flatten
130
- end
131
-
132
- def parse_uri(uri, rel_uri)
133
- return ID('*') if rel_uri.nil?
134
- begin
135
- ID(URI::parse(uri.split('/')[0..3]*'/').merge(rel_uri).to_s)
136
- rescue
137
- ID('*')
138
- end
139
- end
140
-
141
- def add_referenceable_data content, triples, referenceable
142
- resources = {}; triples.each { |s,p,o| resources[o] = true }
143
-
144
- fragment = Node(node_hash(uri, '/'))
145
- selector = Node(nil)
146
- presentation = Node(nil)
147
-
148
- selector.rdf::type = Node('sc:UnivocalSelector')
149
- selector.sc::path = '/'
150
- selector.sc::document = uri
151
-
152
- fragment.sc::selector = selector
153
-
154
- triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples) if referenceable==:dump or resources.include?(fragment)
155
-
156
- content.search('*').each do |node|
157
- next if node.text?
158
-
159
- fragment = Node(node_hash(uri, node.path))
160
-
161
- if referenceable == :dump or resources[fragment]
162
- selector = Node(nil)
163
- presentation = Node(nil)
164
-
165
- triples << [selector, ID('rdf:type'), ID('sc:UnivocalSelector')]
166
- triples << [selector, ID('sc:path'), node.path.to_s]
167
- triples << [selector, ID('sc:tag'), node.name.to_s]
168
- triples << [selector, ID('sc:document'), uri]
169
-
170
- triples << [presentation, ID('sc:x'), node[:vx].to_s] if node[:vx]
171
- triples << [presentation, ID('sc:y'), node[:vy].to_s] if node[:vy]
172
- triples << [presentation, ID('sc:width'), node[:vw].to_s] if node[:vw]
173
- triples << [presentation, ID('sc:height'), node[:vh].to_s] if node[:vh]
174
- triples << [presentation, ID('sc:font_size'), node[:vsize].gsub("px","").to_s] if node[:vsize]
175
- triples << [presentation, ID('sc:font_weight'), node[:vweight].to_s] if node[:vweight]
176
- triples << [presentation, ID('sc:color'), node[:vcolor].to_s] if node[:vcolor]
177
- triples << [presentation, ID('sc:background_color'), node[:vbcolor].to_s] if node[:vbcolor]
178
- triples << [presentation, ID('sc:text'), node.text.strip]
179
- triples << [presentation, ID('sc:children_count'), node.children.select{|n| !n.text?}.size.to_s]
180
-
181
- triples << [fragment, ID('sc:selector'), selector]
182
- triples << [fragment, ID('sc:presentation'), presentation]
183
- end
184
- end
185
- end
186
-
187
- def node_hash uri, path
188
- digest = Digest::MD5.hexdigest("#{uri} #{path}")
189
- :"_:bnode#{digest}"
190
- end
191
-
192
- def selector_pool selector
193
- @selector_pool[selector.id] ||= kb.node(selector)
194
- end
195
- end
196
- end