scrappy 0.1

@@ -0,0 +1,92 @@
+ sioc: http://rdfs.org/sioc/ns#
+ sc: http://lab.gsi.dit.upm.es/scraping.rdf#
+ loc: http://www.daml.org/experiment/ontology/location-ont#
+
+ _:elmundoindice:
+   rdf:type: sc:Fragment
+   sc:selector:
+     *:
+       rdf:type: sc:UriSelector
+       rdf:value: "http://www.elmundo.es/"
+   sc:identifier:
+     *:
+       rdf:type: sc:BaseUriSelector
+   sc:subfragment:
+     *:
+       sc:type: sioc:Post
+       sc:selector:
+         *:
+           rdf:type: sc:CssSelector
+           rdf:value: ".noticia h2, .noticia h3, .noticia h4"
+       sc:identifier:
+         *:
+           rdf:type: sc:CssSelector
+           rdf:value: "a"
+           sc:attribute: "href"
+       sc:subfragment:
+         *:
+           sc:type: rdf:Literal
+           sc:relation: dc:title
+           sc:selector:
+             *:
+               rdf:type: sc:CssSelector
+               rdf:value: "a"
+
+ _:elmundonoticia:
+   rdf:type: sc:Fragment
+   sc:type: sioc:Post
+   sc:selector:
+     *:
+       rdf:type: sc:UriPatternSelector
+       rdf:value: "http://www.elmundo.es/*"
+   sc:identifier:
+     *:
+       rdf:type: sc:BaseUriSelector
+   sc:subfragment:
+     *:
+       sc:type: rdf:Literal
+       sc:relation: dc:creator
+       sc:selector:
+         *:
+           rdf:type: sc:CssSelector
+           rdf:value: ".noticia .firma em"
+     *:
+       sc:type: rdf:Literal
+       sc:relation: dc:title
+       sc:selector:
+         *:
+           rdf:type: sc:CssSelector
+           rdf:value: ".noticia h2"
+     *:
+       sc:type: loc:Location
+       sc:relation: loc:location
+       sc:selector:
+         *:
+           rdf:type: sc:CssSelector
+           rdf:value: ".noticia .firma .localizacion"
+       sc:subfragment:
+         *:
+           sc:type: rdf:Literal
+           sc:relation: rdf:label
+           sc:selector:
+             *:
+               rdf:type: sc:RootSelector
+     *:
+       sc:type: rdf:Literal
+       sc:relation: dc:date
+       sc:selector:
+         *:
+           rdf:type: sc:CssSelector
+           rdf:value: ".metadata_noticia .fecha"
+     *:
+       sc:type: rdf:Literal
+       sc:relation: dc:description
+       sc:selector:
+         *:
+           rdf:type: sc:CssSelector
+           rdf:value: ".contenido_noticia_01 .antetitulo"
+           sc:selector:
+             *:
+               rdf:type: sc:SliceSelector
+               rdf:value: "|"
+               sc:index: "1"
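The file above is a scraping definition in the YARF notation used by lightrdf: each `sc:Fragment` is triggered by a URI selector and maps CSS, slice and base-URI selectors to RDF properties such as `dc:title` or `loc:location`. A minimal usage sketch, assuming the definitions have already been parsed into the agent's knowledge base (the graph-building step and the chosen output format are assumptions, not shown in this diff):

require 'scrappy'

kb = RDF::Graph.new                     # assumption: populated from the YARF definitions above
Scrappy::Agent::Options.kb = kb         # agents pick up Options.kb by default (see Agent#initialize)

agent = Scrappy::Agent.create           # returns a BlindAgent unless :agent => :visual is requested
graph = agent.request :get, 'http://www.elmundo.es/'
puts graph.serialize(:yarf)             # :yarf is the default format in Agent::Options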
@@ -0,0 +1,22 @@
+ $:.unshift(File.dirname(__FILE__)) unless
+   $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
+
+ require 'nokogiri'
+ require 'thread'
+ require 'monitor'
+ require 'mechanize'
+ require 'ostruct'
+ require 'active_support'
+ require 'tmpdir'
+ require 'lightrdf'
+
+ require 'scrappy/support'
+ require 'scrappy/agent/extractor'
+ require 'scrappy/agent/cluster'
+ require 'scrappy/agent/agent'
+
+ Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
+
+ module Scrappy
+   VERSION = '0.1'
+ end
@@ -0,0 +1,90 @@
+ module Scrappy
+   class Agent
+     include Extractor
+     include MonitorMixin
+     include Cluster
+
+     Options = OpenStruct.new :format=>:yarf, :depth=>0, :agent=>:blind, :delay=>0
+     ContentTypes = { :png => 'image/png', :rdfxml => 'application/rdf+xml',
+                      :rdf => 'application/rdf+xml' }
+
+     def self.pool
+       @pool ||= {}
+     end
+     def self.[] id
+       pool[id] || Agent.create(:id=>id)
+     end
+
+     def self.create args={}
+       if (args[:agent] || Options.agent) == :visual
+         require 'scrappy/agent/visual_agent'
+         VisualAgent.new args
+       else
+         require 'scrappy/agent/blind_agent'
+         BlindAgent.new args
+       end
+     end
+
+     attr_accessor :id, :output, :content_type, :status, :options, :kb
+
+     def initialize args={}
+       super()
+       @id = args[:id] || Agent.pool.keys.size
+       Agent.pool[@id] = self
+       @kb = args[:kb] || Options.kb
+       @options = Options.clone
+     end
+
+     def request http_method, uri, inputs={}, depth=options.depth
+       synchronize do
+         uri = "#{uri}.com" if uri =~ /\A\w+\Z/
+         uri = "http://#{uri}" if uri.index(/\A\w*:/) != 0
+
+         # Perform the request
+         if http_method == :get
+           self.uri = uri
+           return RDF::Graph.new unless self.html_data?
+         else
+           raise Exception, 'POST requests not supported yet'
+         end
+
+         # Adds tags including visual information
+         add_visual_data! if options.referenceable
+
+         # Extract data
+         triples = extract self.uri, html, options.referenceable
+
+         # Iterate through subresources
+         if depth > 0
+           uris = (triples.map{|t| [t[0],t[2]]}.flatten-[Node(self.uri)]).uniq.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}.map(&:to_s)
+           Agent.process(uris, :depth=>depth-1).each { |result| triples += result }
+         end
+         RDF::Graph.new(triples.uniq)
+       end
+     end
+
+     def proxy http_method, uri, inputs={}, format=options.format, depth=options.depth
+       synchronize do
+         if @status == :redirect and uri == self.uri
+           @status = :ok
+         else
+           @output = request(http_method, uri, inputs, depth).serialize(format)
+           @content_type = ContentTypes[format] || 'text/plain'
+           @status = if self.html_data?
+             self.uri == uri ? :ok : :redirect
+           else
+             :error
+           end
+         end
+
+         @output
+       end
+     end
+
+     # Method used when consuming a list of uris
+     def process uri, args={}
+       sleep 0.001 * options.delay.to_f
+       request(:get, uri, {}, args[:depth]).triples
+     end
+   end
+ end
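Agents register themselves in a class-level pool, so `Agent[id]` either reuses an existing instance or creates one; `proxy` wraps `request`, serializes the resulting graph and records the content type and status. A short sketch of that flow (the id and URI are only examples):

agent = Scrappy::Agent[:front_page]               # created and pooled on first access
agent.proxy :get, 'http://www.elmundo.es/', {}, :rdfxml
agent.content_type                                # => "application/rdf+xml" (from ContentTypes)
agent.status                                      # => :ok, :redirect or :error
puts agent.output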
@@ -0,0 +1,34 @@
+ module Scrappy
+   class BlindAgent < Agent
+     def initialize args={}
+       super
+       @mechanize = Mechanize.new
+     end
+
+     def uri
+       @loaded ? @mechanize.current_page.uri.to_s : nil
+     end
+
+     def uri= uri
+       synchronize do
+         begin
+           @mechanize.get uri
+           @loaded = true
+         rescue
+           @loaded = false
+         end
+       end
+     end
+
+     def html_data?
+       !uri.nil? and @mechanize.current_page.is_a?(Mechanize::Page)
+     end
+
+     def html
+       @mechanize.current_page.root.to_html :encoding=>'UTF-8'
+     end
+
+     def add_visual_data!
+     end
+   end
+ end
@@ -0,0 +1,35 @@
+ module Cluster
+
+   def self.included(klass)
+     klass.extend ClassMethods
+     klass.extend MonitorMixin
+   end
+
+   def consume(list, results, args={})
+     begin
+       element = list.synchronize { list.pop }
+       unless element.nil?
+         result = process(element, args)
+         results.synchronize { results << result }
+       end
+     end until element.nil?
+   end
+
+   module ClassMethods
+     def cluster; @cluster; end
+     def cluster= value; @cluster=value; end
+
+     def create_cluster count, *args
+       self.cluster = (1..count).map { args.nil? ? create : create(*args) }
+     end
+
+     def process(list=[], args={})
+       results = []
+       list.extend MonitorMixin
+       results.extend MonitorMixin
+       cluster.map { |o| Thread.new { o.consume(list, results, args) } }.each { |t| t.join }
+       results
+     end
+   end
+
+ end
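The Cluster mixin turns the class into a small work queue: `create_cluster` builds a fixed set of workers and `process` fans a list out over one thread per worker, each popping items until the list is empty. Since `Agent#request` delegates to `Agent.process` whenever `depth > 0`, a cluster has to exist before crawling subresources. A usage sketch (worker count, delay and URIs are illustrative):

Scrappy::Agent::Options.delay = 100    # Agent#process sleeps 0.001 * delay seconds between requests
Scrappy::Agent.create_cluster 4        # four agents sharing the URI list
uris = ['http://www.elmundo.es/', 'http://www.elmundo.es/espana.html']
results = Scrappy::Agent.process(uris, :depth => 0)   # one array of triples per URI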
@@ -0,0 +1,159 @@
+ module Scrappy
+   module Extractor
+     def extract uri, html, referenceable=nil
+       triples = []
+       content = Nokogiri::HTML(html, nil, 'utf-8')
+       uri_selectors = kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')).select{ |n| n.rdf::value.include?(uri.match(/\A([^\?]*)(\?.*\Z)?/).captures.first) }
+       uri_selectors += kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector')).select{|n| n.rdf::value.any?{|v| /\A#{v.gsub('.','\.').gsub('*', '.+')}\Z/ =~ uri} }
+
+       fragments = uri_selectors.map { |uri_selector| kb.find(nil, Node('sc:selector'), uri_selector) }.flatten
+       fragments.each do |fragment|
+         extract_fragment fragment, :doc=>{:uri=>uri, :content=>content },
+                          :parent=>uri, :triples=>triples, :referenceable=>!referenceable.nil?
+       end
+
+       add_referenceable_data content, triples, referenceable if referenceable
+
+       triples
+     end
+
+     private
+     def extract_fragment fragment, options={}
+       node = Node(options[:parent])
+       uri = options[:doc][:uri]
+
+       # Select nodes
+       docs = fragment.sc::selector.map { |s| filter s, options[:doc] }.flatten
+
+       # Generate triples
+       docs.each do |doc|
+         # Build URIs if identifier present
+         nodes = fragment.sc::identifier.map { |s| filter s, doc }.flatten.map{ |d| Node(parse_uri(uri, d[:value])) }
+         nodes << Node(nil) if nodes.empty?
+
+         nodes.each do |node|
+           # Build the object
+           object = if fragment.sc::type.first == Node('rdf:Literal')
+             value = doc[:value].strip
+             if options[:referenceable]
+               bnode = Node(nil)
+               bnode.rdf::value = value
+               bnode.rdf::type = Node('rdf:Literal')
+               bnode
+             else
+               value
+             end
+           elsif fragment.sc::type.first
+             options[:triples] << [node, Node('rdf:type'), fragment.sc::type.first]
+             node
+           else
+             node
+           end
+           fragment.sc::relation.each { |relation| options[:triples] << [options[:parent], relation, object] }
+
+           # Add referenceable data if requested
+           if options[:referenceable]
+             source = Node("_:#{doc[:uri]}|#{doc[:content].path}")
+             options[:triples] << [ object, Node("sc:source"), source ]
+             fragment.sc::type.each { |t| options[:triples] << [ source, Node("sc:type"), t ] }
+             fragment.sc::relation.each { |relation| options[:triples] << [ source, Node("sc:relation"), relation ] }
+           end
+
+           # Process subfragments
+           fragment.sc::subfragment.each { |subfragment| extract_fragment subfragment, options.merge(:doc=>doc, :parent=>object) }
+         end
+       end
+     end
+
+     def filter selector, doc
+       content = doc[:content]
+       uri = doc[:uri]
+       results = if selector.rdf::type.include?(Node('sc:CssSelector')) or
+                    selector.rdf::type.include?(Node('sc:XPathSelector'))
+         selector.rdf::value.map do |pattern|
+           content.search(pattern).map do |result|
+             if selector.sc::attribute.first
+               # Select node's attribute if given
+               selector.sc::attribute.map { |attribute| { :uri=>uri, :content=>result, :value=>result[attribute] } }
+             else
+               # Select node
+               [ { :uri=>uri, :content=>result, :value=>result.text } ]
+             end
+           end
+         end.flatten
+
+       elsif selector.rdf::type.include?(Node('sc:SliceSelector'))
+         text = content.text
+         selector.rdf::value.map do |separator|
+           slices = text.split(separator)
+           selector.sc::index.map { |index| { :uri=>uri, :content=>content, :value=>slices[index.to_i].to_s.strip} }
+         end.flatten
+
+       elsif selector.rdf::type.include?(Node('sc:BaseUriSelector'))
+         [ { :uri=>uri, :content=>content, :value=>uri } ]
+
+       else
+         [ { :uri=>uri, :content=>content, :value=>content.text } ]
+       end
+
+       # Process nested selectors, if any
+       return results if selector.sc::selector.empty?
+       results.map do |result|
+         selector.sc::selector.map { |s| filter s, result }
+       end.flatten
+     end
+
+     def parse_uri(uri, rel_uri)
+       return ID('*') if rel_uri.nil?
+       begin
+         ID(URI::parse(uri.split('/')[0..3]*'/').merge(rel_uri))
+       rescue
+         ID('*')
+       end
+     end
+
+     def add_referenceable_data content, triples, referenceable
+       resources = triples.map{|s,p,o| [[s],[o]]}.flatten
+
+       fragment = Node("_:#{uri}|/")
+       selector = Node(nil)
+       presentation = Node(nil)
+
+       selector.rdf::type = Node('sc:UnivocalSelector')
+       selector.sc::path = '/'
+       selector.sc::uri = uri
+
+       fragment.sc::selector = selector
+
+       triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples) if referenceable==:dump or resources.include?(fragment)
+
+       content.search('*').each do |node|
+         fragment = Node("_:#{uri}|#{node.path}")
+
+         if referenceable == :dump or resources.include?(fragment)
+           selector = Node(nil)
+           presentation = Node(nil)
+
+           selector.rdf::type = Node('sc:UnivocalSelector')
+           selector.sc::path = node.path.to_s
+           selector.sc::tag = node.name.to_s
+           selector.sc::uri = uri
+
+           presentation.sc::x = node[:vx].to_s if node[:vx]
+           presentation.sc::y = node[:vy].to_s if node[:vy]
+           presentation.sc::width = node[:vw].to_s if node[:vw]
+           presentation.sc::height = node[:vh].to_s if node[:vh]
+           presentation.sc::font_size = node[:vsize].gsub("px","").to_s if node[:vsize]
+           presentation.sc::font_weight = node[:vweight].to_s if node[:vweight]
+           presentation.sc::color = node[:vcolor].to_s if node[:vcolor]
+           presentation.sc::background_color = node[:vbcolor].to_s if node[:vbcolor]
+
+           fragment.sc::selector = selector
+           fragment.sc::presentation = presentation unless presentation.empty?
+
+           triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples)
+         end
+       end
+     end
+   end
+ end
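The extractor walks the fragment tree recursively: `filter` resolves each selector against the current Nokogiri node and `extract_fragment` emits one triple per `sc:relation`, adding `sc:source` and presentation bookkeeping when `referenceable` is set. A small sketch of driving it directly (the saved HTML file is hypothetical and the kb is assumed to hold the fragments from the sample file above):

agent = Scrappy::Agent.create
html  = File.read('elmundo_portada.html')          # hypothetical local copy of the front page
triples = agent.extract 'http://www.elmundo.es/', html
triples.each { |s, p, o| puts [s, p, o].inspect }  # e.g. [post, dc:title, "Headline"]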
@@ -0,0 +1,72 @@
+ # Hack to hide annoying gtk debug messages
+ old_stderr = $stderr.clone
+ $stderr.reopen '/dev/null'
+ require 'scrappy/webkit/webkit'
+ $stderr = old_stderr
+
+ module Scrappy
+   class VisualAgent < Agent
+     def initialize args={}
+       super
+
+       @cv = new_cond
+
+       @webview = Gtk::WebKit::WebView.new
+       @webview.signal_connect("load_finished") { synchronize { @cv.signal } }
+
+       @window = Gtk::Window.new
+       @window.signal_connect("destroy") { Gtk.main_quit }
+       @window.add(@webview)
+       @window.set_size_request(1024, 600)
+       @window.show_all if args[:window] or (args[:window].nil? and Agent::Options.window)
+     end
+
+     def uri
+       @webview.uri
+     end
+
+     def uri= uri
+       synchronize do
+         @webview.open uri.to_s
+         @cv.wait(60) # 1 minute to open the page
+         sleep(1) while !Nokogiri::HTML(html).search("head").empty? and Nokogiri::HTML(html).search("body").empty?
+       end
+     end
+
+     def html_data?
+       uri.to_s != ""
+     end
+
+     def html
+       js "document.documentElement.outerHTML"
+     end
+
+     def add_visual_data!
+       js """var items = document.documentElement.getElementsByTagName('*');
+             var i=0;
+             for(var i=0; i<items.length; i++) {
+               var item = items[i];
+               item.setAttribute('vx', item.offsetLeft)
+               item.setAttribute('vy', item.offsetTop)
+               item.setAttribute('vw', item.offsetWidth)
+               item.setAttribute('vh', item.offsetHeight)
+               item.setAttribute('vsize', document.defaultView.getComputedStyle(item, null).getPropertyValue('font-size'))
+               item.setAttribute('vweight', document.defaultView.getComputedStyle(item, null).getPropertyValue('font-weight'))
+               item.setAttribute('vcolor', document.defaultView.getComputedStyle(item, null).getPropertyValue('color'))
+               item.setAttribute('vbcolor', document.defaultView.getComputedStyle(item, null).getPropertyValue('background-color'))
+             }"""
+     end
+
+
+     private
+     def js code
+       old_title = @webview.title
+       @webview.execute_script("document.title = JSON.stringify(eval(#{ActiveSupport::JSON.encode(code)}))")
+       title = ActiveSupport::JSON.decode(@webview.title)
+       @webview.execute_script("document.title = #{ActiveSupport::JSON.encode(old_title)}")
+       title
+     end
+   end
+ end
+
+ Thread.new { Gtk.main }
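The visual agent renders pages in a GTK WebKit view (hence the background `Gtk.main` thread started above), and `add_visual_data!` tags every element with `vx`/`vy`/`vw`/`vh` and computed-style attributes that the extractor later turns into `sc:presentation` triples. A sketch of requesting it explicitly (a working GTK display is required; the flag values are examples):

agent = Scrappy::Agent.create :agent => :visual, :window => true
agent.options.referenceable = :dump    # also emit sc:selector / sc:presentation data
graph = agent.request :get, 'http://www.elmundo.es/'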