scrappy 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,92 @@
1
+ sioc: http://rdfs.org/sioc/ns#
2
+ sc: http://lab.gsi.dit.upm.es/scraping.rdf#
3
+ loc: http://www.daml.org/experiment/ontology/location-ont#
4
+
5
+ _:elmundoindice:
6
+ rdf:type: sc:Fragment
7
+ sc:selector:
8
+ *:
9
+ rdf:type: sc:UriSelector
10
+ rdf:value: "http://www.elmundo.es/"
11
+ sc:identifier:
12
+ *:
13
+ rdf:type: sc:BaseUriSelector
14
+ sc:subfragment:
15
+ *:
16
+ sc:type: sioc:Post
17
+ sc:selector:
18
+ *:
19
+ rdf:type: sc:CssSelector
20
+ rdf:value: ".noticia h2, .noticia h3, .noticia h4"
21
+ sc:identifier:
22
+ *:
23
+ rdf:type: sc:CssSelector
24
+ rdf:value: "a"
25
+ sc:attribute: "href"
26
+ sc:subfragment:
27
+ *:
28
+ sc:type: rdf:Literal
29
+ sc:relation: dc:title
30
+ sc:selector:
31
+ *:
32
+ rdf:type: sc:CssSelector
33
+ rdf:value: "a"
34
+
35
+ _:elmundonoticia:
36
+ rdf:type: sc:Fragment
37
+ sc:type: sioc:Post
38
+ sc:selector:
39
+ *:
40
+ rdf:type: sc:UriPatternSelector
41
+ rdf:value: "http://www.elmundo.es/*"
42
+ sc:identifier:
43
+ *:
44
+ rdf:type: sc:BaseUriSelector
45
+ sc:subfragment:
46
+ *:
47
+ sc:type: rdf:Literal
48
+ sc:relation: dc:creator
49
+ sc:selector:
50
+ *:
51
+ rdf:type: sc:CssSelector
52
+ rdf:value: ".noticia .firma em"
53
+ *:
54
+ sc:type: rdf:Literal
55
+ sc:relation: dc:title
56
+ sc:selector:
57
+ *:
58
+ rdf:type: sc:CssSelector
59
+ rdf:value: ".noticia h2"
60
+ *:
61
+ sc:type: loc:Location
62
+ sc:relation: loc:location
63
+ sc:selector:
64
+ *:
65
+ rdf:type: sc:CssSelector
66
+ rdf:value: ".noticia .firma .localizacion"
67
+ sc:subfragment:
68
+ *:
69
+ sc:type: rdf:Literal
70
+ sc:relation: rdf:label
71
+ sc:selector:
72
+ *:
73
+ rdf:type: sc:RootSelector
74
+ *:
75
+ sc:type: rdf:Literal
76
+ sc:relation: dc:date
77
+ sc:selector:
78
+ *:
79
+ rdf:type: sc:CssSelector
80
+ rdf:value: ".metadata_noticia .fecha"
81
+ *:
82
+ sc:type: rdf:Literal
83
+ sc:relation: dc:description
84
+ sc:selector:
85
+ *:
86
+ rdf:type: sc:CssSelector
87
+ rdf:value: ".contenido_noticia_01 .antetitulo"
88
+ sc:selector:
89
+ *:
90
+ rdf:type: sc:SliceSelector
91
+ rdf:value: "|"
92
+ sc:index: "1"
@@ -0,0 +1,22 @@
1
# Library entry point for Scrappy: fixes up the load path, loads the
# third-party dependencies and the agent components, and declares the
# scraping RDF vocabulary namespace.

# Prepend this file's directory to the load path unless it is already
# there (in either its relative or expanded form).
$:.unshift(File.dirname(__FILE__)) unless
  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))

require 'nokogiri'
require 'thread'
require 'monitor'
require 'mechanize'
require 'ostruct'
require 'active_support'
require 'tmpdir'
require 'lightrdf'

require 'scrappy/support'
require 'scrappy/agent/extractor'
require 'scrappy/agent/cluster'
require 'scrappy/agent/agent'

# Register the sc: prefix with lightrdf so identifiers such as
# Node('sc:Fragment') resolve against the scraping vocabulary.
Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'

module Scrappy
  VERSION = '0.1'  # gem version
end
@@ -0,0 +1,90 @@
1
module Scrappy
  # Core scraping agent. An Agent fetches a page (through a BlindAgent or
  # VisualAgent subclass, which provide uri=, html, html_data? and
  # add_visual_data!), runs the Extractor over it and returns an RDF graph.
  # Agents are pooled by id and can run in parallel via Cluster.
  class Agent
    include Extractor
    include MonitorMixin
    include Cluster

    # Process-wide default options; each agent clones this struct.
    Options = OpenStruct.new :format=>:yarf, :depth=>0, :agent=>:blind, :delay=>0
    # HTTP content types per serialization format; anything else is
    # served as text/plain (see #proxy).
    ContentTypes = { :png => 'image/png', :rdfxml => 'application/rdf+xml',
                     :rdf => 'application/rdf+xml' }

    # Registry of all created agents, keyed by id.
    def self.pool
      @pool ||= {}
    end
    # Returns the pooled agent with the given id, creating one on demand.
    def self.[] id
      pool[id] || Agent.create(:id=>id)
    end

    # Factory: builds a visual (GTK/WebKit) or blind (Mechanize) agent
    # depending on args[:agent] / Options.agent. Files are required lazily
    # so the GUI stack is only loaded when a visual agent is requested.
    def self.create args={}
      if (args[:agent] || Options.agent) == :visual
        require 'scrappy/agent/visual_agent'
        VisualAgent.new args
      else
        require 'scrappy/agent/blind_agent'
        BlindAgent.new args
      end
    end

    attr_accessor :id, :output, :content_type, :status, :options, :kb

    def initialize args={}
      super()  # initializes MonitorMixin state
      @id = args[:id] || Agent.pool.keys.size
      # NOTE(review): passing an explicit :id replaces any pooled agent
      # that already uses that id.
      Agent.pool[@id] = self
      @kb = args[:kb] || Options.kb
      @options = Options.clone
    end

    # Fetches +uri+ and returns an RDF::Graph of the extracted triples.
    # Only :get is supported. While depth > 0, URIs referenced by the
    # extracted triples are fetched recursively through the cluster.
    def request http_method, uri, inputs={}, depth=options.depth
      synchronize do
        # Normalize shorthand input: a bare word gets ".com" appended,
        # and a missing scheme defaults to http://.
        uri = "#{uri}.com" if uri =~ /\A\w+\Z/
        uri = "http://#{uri}" if uri.index(/\A\w*:/) != 0

        # Perform the request
        if http_method == :get
          self.uri = uri
          # Empty graph when the page failed to load or is not HTML.
          return RDF::Graph.new unless self.html_data?
        else
          raise Exception, 'POST requests not supported yet'
        end

        # Adds tags including visual information
        add_visual_data! if options.referenceable

        # Extract data
        triples = extract self.uri, html, options.referenceable

        # Iterate through subresources
        if depth > 0
          # Candidate URIs: every subject/object node with a URI id,
          # excluding the page itself.
          uris = (triples.map{|t| [t[0],t[2]]}.flatten-[Node(self.uri)]).uniq.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}.map(&:to_s)
          Agent.process(uris, :depth=>depth-1).each { |result| triples += result }
        end
        RDF::Graph.new(triples.uniq)
      end
    end

    # HTTP-proxy-style entry point: performs the request, serializes the
    # resulting graph into @output and sets @content_type and @status.
    # A pending :redirect is cleared when the redirected URI is requested
    # again (the page is already loaded, so no new fetch happens).
    def proxy http_method, uri, inputs={}, format=options.format, depth=options.depth
      synchronize do
        if @status == :redirect and uri == self.uri
          @status = :ok
        else
          @output = request(http_method, uri, inputs, depth).serialize(format)
          @content_type = ContentTypes[format] || 'text/plain'
          @status = if self.html_data?
            self.uri == uri ? :ok : :redirect
          else
            :error
          end
        end

        @output
      end
    end

    # Method used when consuming a list of uris
    def process uri, args={}
      # Throttle between requests; options.delay is expressed in milliseconds.
      sleep 0.001 * options.delay.to_f
      request(:get, uri, {}, args[:depth]).triples
    end
  end
end
@@ -0,0 +1,34 @@
1
module Scrappy
  # Headless agent backed by Mechanize: fetches pages over plain HTTP
  # without rendering them, so no visual metadata is available.
  class BlindAgent < Agent
    def initialize args={}
      super
      @mechanize = Mechanize.new
    end

    # URI of the currently loaded page, or nil when no page is loaded.
    def uri
      return nil unless @loaded
      @mechanize.current_page.uri.to_s
    end

    # Navigates to +uri+. Any fetch error simply marks the agent as
    # unloaded instead of raising.
    def uri= uri
      synchronize do
        @loaded = begin
          @mechanize.get uri
          true
        rescue
          false
        end
      end
    end

    # True when a page is loaded and Mechanize parsed it as HTML.
    def html_data?
      !uri.nil? && @mechanize.current_page.is_a?(Mechanize::Page)
    end

    # Current page serialized as UTF-8 HTML.
    def html
      @mechanize.current_page.root.to_html :encoding=>'UTF-8'
    end

    # Visual information requires a rendering engine; nothing to do here.
    def add_visual_data!
    end
  end
end
@@ -0,0 +1,35 @@
1
# Cluster adds a simple thread-pool capability to a class: a fixed set of
# worker instances concurrently consume items from a shared list, each
# worker handling one item at a time through its #process instance method.
# The including class must also provide a +create+ class method used to
# build workers.
module Cluster

  # Hook: equips the including class with the pool-management class
  # methods and a class-level monitor.
  def self.included(klass)
    klass.extend ClassMethods
    klass.extend MonitorMixin
  end

  # Worker loop: pop items off +list+ until it is empty, process each one
  # and append the result to +results+. Both collections must respond to
  # #synchronize (ClassMethods#process extends them with MonitorMixin).
  # Note: +list+ is drained (mutated) in the process.
  def consume(list, results, args={})
    begin
      element = list.synchronize { list.pop }
      unless element.nil?
        result = process(element, args)
        results.synchronize { results << result }
      end
    end until element.nil?
  end

  module ClassMethods
    # The array of worker instances, populated by #create_cluster.
    def cluster; @cluster; end
    def cluster= value; @cluster=value; end

    # Builds +count+ workers via the including class's +create+ method.
    # Fix: a splatted +args+ is never nil (it defaults to []), so the
    # original `args.nil?` check was dead code — test emptiness instead.
    def create_cluster count, *args
      self.cluster = (1..count).map { args.empty? ? create : create(*args) }
    end

    # Distributes +list+ across the cluster, one thread per worker, and
    # returns the collected results (order is not guaranteed).
    def process(list=[], args={})
      results = []
      list.extend MonitorMixin
      results.extend MonitorMixin
      cluster.map { |o| Thread.new { o.consume(list, results, args) } }.each { |t| t.join }
      results
    end
  end

end
@@ -0,0 +1,159 @@
1
module Scrappy
  # Extractor walks the scraping knowledge base (+kb+) and applies its
  # selectors to an HTML document, producing RDF triples.
  #
  # NOTE(review): Node(), ID() and the rdf::/sc:: attribute accessors come
  # from the lightrdf gem; +kb+ and +uri+ are expected from the including
  # class (Agent exposes kb as an accessor and uri via its subclasses).
  module Extractor
    # Extracts triples from +html+ as retrieved from +uri+.
    # +referenceable+: nil (off), :dump (annotate every DOM node) or any
    # other truthy value (annotate only referenced nodes).
    # Returns an array of [subject, predicate, object] triples.
    def extract uri, html, referenceable=nil
      triples = []
      content = Nokogiri::HTML(html, nil, 'utf-8')
      # Selectors that literally list this URI (query string stripped)...
      uri_selectors = kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')).select{ |n| n.rdf::value.include?(uri.match(/\A([^\?]*)(\?.*\Z)?/).captures.first) }
      # ...plus pattern selectors, where '*' matches one or more characters.
      uri_selectors += kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector')).select{|n| n.rdf::value.any?{|v| /\A#{v.gsub('.','\.').gsub('*', '.+')}\Z/ =~ uri} }

      # Root fragments are the ones whose sc:selector matched the URI.
      fragments = uri_selectors.map { |uri_selector| kb.find(nil, Node('sc:selector'), uri_selector) }.flatten
      fragments.each do |fragment|
        extract_fragment fragment, :doc=>{:uri=>uri, :content=>content },
                         :parent=>uri, :triples=>triples, :referenceable=>!referenceable.nil?
      end

      add_referenceable_data content, triples, referenceable if referenceable

      triples
    end

    private
    # Recursively applies one sc:Fragment definition to a document slice
    # (options[:doc]), appending triples to options[:triples].
    def extract_fragment fragment, options={}
      node = Node(options[:parent])
      uri = options[:doc][:uri]

      # Select nodes
      docs = fragment.sc::selector.map { |s| filter s, options[:doc] }.flatten

      # Generate triples
      docs.each do |doc|
        # Build URIs if identifier present
        nodes = fragment.sc::identifier.map { |s| filter s, doc }.flatten.map{ |d| Node(parse_uri(uri, d[:value])) }
        nodes << Node(nil) if nodes.empty?  # anonymous bnode when no identifier

        nodes.each do |node|
          # Build the object: a literal value, a typed resource, or a bare node.
          object = if fragment.sc::type.first == Node('rdf:Literal')
            value = doc[:value].strip
            if options[:referenceable]
              # Wrap the literal in a bnode so provenance can point at it.
              bnode = Node(nil)
              bnode.rdf::value = value
              bnode.rdf::type = Node('rdf:Literal')
              bnode
            else
              value
            end
          elsif fragment.sc::type.first
            options[:triples] << [node, Node('rdf:type'), fragment.sc::type.first]
            node
          else
            node
          end
          fragment.sc::relation.each { |relation| options[:triples] << [options[:parent], relation, object] }

          # Add referenceable data if requested
          if options[:referenceable]
            # Provenance node keyed by page URI and DOM path of the match.
            source = Node("_:#{doc[:uri]}|#{doc[:content].path}")
            options[:triples] << [ object, Node("sc:source"), source ]
            fragment.sc::type.each { |t| options[:triples] << [ source, Node("sc:type"), t ] }
            fragment.sc::relation.each { |relation| options[:triples] << [ source, Node("sc:relation"), relation ] }
          end

          # Process subfragments
          fragment.sc::subfragment.each { |subfragment| extract_fragment subfragment, options.merge(:doc=>doc, :parent=>object) }
        end
      end
    end

    # Applies one selector to a document slice and returns an array of
    # {:uri, :content, :value} hashes, one per match. Nested selectors
    # are applied to each result in turn.
    def filter selector, doc
      content = doc[:content]
      uri = doc[:uri]
      results = if selector.rdf::type.include?(Node('sc:CssSelector')) or
                   selector.rdf::type.include?(Node('sc:XPathSelector'))
        # Nokogiri#search accepts both CSS and XPath expressions.
        selector.rdf::value.map do |pattern|
          content.search(pattern).map do |result|
            if selector.sc::attribute.first
              # Select node's attribute if given
              selector.sc::attribute.map { |attribute| { :uri=>uri, :content=>result, :value=>result[attribute] } }
            else
              # Select node
              [ { :uri=>uri, :content=>result, :value=>result.text } ]
            end
          end
        end.flatten

      elsif selector.rdf::type.include?(Node('sc:SliceSelector'))
        # Split the node's text by each separator and keep the requested slices.
        text = content.text
        selector.rdf::value.map do |separator|
          slices = text.split(separator)
          selector.sc::index.map { |index| { :uri=>uri, :content=>content, :value=>slices[index.to_i].to_s.strip} }
        end.flatten

      elsif selector.rdf::type.include?(Node('sc:BaseUriSelector'))
        # The document's own URI is the value.
        [ { :uri=>uri, :content=>content, :value=>uri } ]

      else
        # Fallback (e.g. sc:RootSelector): the slice itself.
        [ { :uri=>uri, :content=>content, :value=>content.text } ]
      end

      # Process nested selectors, if any
      return results if selector.sc::selector.empty?
      results.map do |result|
        selector.sc::selector.map { |s| filter s, result }
      end.flatten
    end

    # Resolves +rel_uri+ against the site root of +uri+ (scheme + host,
    # i.e. the first four '/'-separated components). Returns ID('*') when
    # the value is missing or unparseable.
    def parse_uri(uri, rel_uri)
      return ID('*') if rel_uri.nil?
      begin
        ID(URI::parse(uri.split('/')[0..3]*'/').merge(rel_uri))
      rescue
        ID('*')
      end
    end

    # Emits sc:UnivocalSelector / sc:presentation triples describing the
    # DOM nodes (path, tag, visual v* attributes) either for every node
    # (:dump) or only for nodes already referenced by +triples+.
    # NOTE(review): relies on the including class's +uri+ method.
    def add_referenceable_data content, triples, referenceable
      resources = triples.map{|s,p,o| [[s],[o]]}.flatten

      fragment = Node("_:#{uri}|/")
      selector = Node(nil)
      presentation = Node(nil)

      selector.rdf::type = Node('sc:UnivocalSelector')
      selector.sc::path = '/'
      selector.sc::uri = uri

      fragment.sc::selector = selector

      triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples) if referenceable==:dump or resources.include?(fragment)

      content.search('*').each do |node|
        fragment = Node("_:#{uri}|#{node.path}")

        if referenceable == :dump or resources.include?(fragment)
          selector = Node(nil)
          presentation = Node(nil)

          selector.rdf::type = Node('sc:UnivocalSelector')
          selector.sc::path = node.path.to_s
          selector.sc::tag = node.name.to_s
          selector.sc::uri = uri

          # v* attributes are injected by VisualAgent#add_visual_data!.
          presentation.sc::x = node[:vx].to_s if node[:vx]
          presentation.sc::y = node[:vy].to_s if node[:vy]
          presentation.sc::width = node[:vw].to_s if node[:vw]
          presentation.sc::height = node[:vh].to_s if node[:vh]
          presentation.sc::font_size = node[:vsize].gsub("px","").to_s if node[:vsize]
          presentation.sc::font_weight = node[:vweight].to_s if node[:vweight]
          presentation.sc::color = node[:vcolor].to_s if node[:vcolor]
          presentation.sc::background_color = node[:vbcolor].to_s if node[:vbcolor]

          fragment.sc::selector = selector
          fragment.sc::presentation = presentation unless presentation.empty?

          triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples)
        end
      end
    end
  end
end
@@ -0,0 +1,72 @@
1
# Hack to hide annoying gtk debug messages: silence stderr while the
# webkit bindings load, then restore it.
old_stderr = $stderr.clone
$stderr.reopen '/dev/null'
require 'scrappy/webkit/webkit'
$stderr = old_stderr

module Scrappy
  # Agent that loads pages in a real GTK/WebKit view, so rendered visual
  # properties (position, size, fonts, colors) can be harvested.
  class VisualAgent < Agent
    def initialize args={}
      super

      # Condition variable (from MonitorMixin) signalled when a page load finishes.
      @cv = new_cond

      @webview = Gtk::WebKit::WebView.new
      @webview.signal_connect("load_finished") { synchronize { @cv.signal } }

      @window = Gtk::Window.new
      @window.signal_connect("destroy") { Gtk.main_quit }
      @window.add(@webview)
      @window.set_size_request(1024, 600)
      # Show the browser window if requested explicitly or by global options.
      @window.show_all if args[:window] or (args[:window].nil? and Agent::Options.window)
    end

    # URI of the page currently shown in the web view.
    def uri
      @webview.uri
    end

    # Navigates to +uri+ and blocks until the page has loaded (or the
    # 60-second wait elapses), then polls until a <body> appears when the
    # document already has a <head>.
    def uri= uri
      synchronize do
        @webview.open uri.to_s
        @cv.wait(60) # 1 minute to open the page
        sleep(1) while !Nokogiri::HTML(html).search("head").empty? and Nokogiri::HTML(html).search("body").empty?
      end
    end

    # True when some page is loaded. NOTE(review): unlike BlindAgent this
    # does not distinguish HTML from other content types.
    def html_data?
      uri.to_s != ""
    end

    # Serialized DOM of the rendered page, obtained via JavaScript.
    def html
      js "document.documentElement.outerHTML"
    end

    # Stamps every element with v* attributes (position, size, computed
    # style) so the Extractor can emit sc:presentation triples.
    # NOTE: the triple quotes are not a heredoc — Ruby concatenates the
    # adjacent string literals; the JS source must stay byte-identical.
    def add_visual_data!
      js """var items = document.documentElement.getElementsByTagName('*');
        var i=0;
        for(var i=0; i<items.length; i++) {
          var item = items[i];
          item.setAttribute('vx', item.offsetLeft)
          item.setAttribute('vy', item.offsetTop)
          item.setAttribute('vw', item.offsetWidth)
          item.setAttribute('vh', item.offsetHeight)
          item.setAttribute('vsize', document.defaultView.getComputedStyle(item, null).getPropertyValue('font-size'))
          item.setAttribute('vweight', document.defaultView.getComputedStyle(item, null).getPropertyValue('font-weight'))
          item.setAttribute('vcolor', document.defaultView.getComputedStyle(item, null).getPropertyValue('color'))
          item.setAttribute('vbcolor', document.defaultView.getComputedStyle(item, null).getPropertyValue('background-color'))
        }"""
    end


    private
    # Evaluates +code+ in the page and returns the JSON-decoded result.
    # WebKit offers no direct return channel, so the result is smuggled
    # through document.title and the old title is restored afterwards.
    def js code
      old_title = @webview.title
      @webview.execute_script("document.title = JSON.stringify(eval(#{ActiveSupport::JSON.encode(code)}))")
      title = ActiveSupport::JSON.decode(@webview.title)
      @webview.execute_script("document.title = #{ActiveSupport::JSON.encode(old_title)}")
      title
    end
  end
end

# Run the GTK main loop in the background so web views stay responsive.
Thread.new { Gtk.main }