scrappy 0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,41 @@
1
+ require 'camping'
2
+ require 'camping/session'
3
+ require 'open3'
4
+
5
+ Camping.goes :Scrappy
6
+
7
module Scrappy
  module Controllers
    # Catch-all Camping controller: every request (any path) is proxied
    # through a per-client Scrappy agent.
    class Index < R '.*'
      include InputEscaping

      def get
        process_request :get
      end

      def post
        process_request :post
      end

      protected

      # Hands the request over to the agent and renders its outcome:
      # a redirect, the agent's output, or a 500 error page.
      def process_request verb
        agent.proxy verb, request.env["REQUEST_URI"], @input

        status = agent.status
        if status == :redirect
          redirect agent.uri
        elsif status == :ok
          @headers['Content-Type'] = agent.content_type
          agent.output
        else
          @status = 500
          'Error'
        end
      end

      # One agent per client, keyed by the client's IP address.
      def agent
        Scrappy::Agent[@request.env["REMOTE_ADDR"]]
      end
    end
  end
end
@@ -0,0 +1,77 @@
1
+ require 'camping'
2
+ require 'camping/session'
3
+ require 'open3'
4
+
5
+ Camping.goes :Scrappy
6
+
7
module Scrappy
  # Cookie-based sessions so each browser keeps its own agent between requests.
  include Camping::Session
  secret '1a36591bceec49c832079e270d7e8b73'

  module Controllers
    # Landing page: static usage instructions rendered with Markaby.
    class Index
      def get
        mab do
          html do
            head {}
            body do
              h1 "Scrappy Web Server"
              p "Use following URL format: http://[host]/[format]/[url]"
              p do
                "For example: " + a("http://localhost:3434/rdfxml/http://www.google.com",
                                    :href=>"http://localhost:3434/rdfxml/http://www.google.com")
              end
              p do
                "Remember to escape parameters: " +
                "http://www.example.com/~user/%3Ftest%3D1%26test1%3D2<br/> or<br/> " +
                "http%3A%2F%2Fwww.example.com%2F~user%2F%3Ftest%3D1%26test1%3D2<br/>" +
                "instead of<br/> http://www.example.com/~user/?test=1&test1=2"
              end
              p do
                "Available formats are png, yarf, rdfxml, ntriples, turtle, json, jsonrdf, ejson"
              end
            end
          end
        end
      end
    end

    # Extraction endpoint: /<format>/<url> scrapes <url> and returns the
    # result serialized in <format> (e.g. rdfxml, turtle, json).
    class Extract < R '/(\w+)/(.+)'
      include InputEscaping

      def get format, url
        process_request :get, format, url
      end

      def post format, url
        process_request :post, format, url
      end

      protected
      # Proxies the request through the session's agent. The reserved
      # 'callback' parameter (stripped before proxying) wraps the output
      # for JSONP-style consumption.
      def process_request http_method, format, url
        callback = @input['callback']
        agent.proxy http_method, url, @input.reject{|k,v| k=='callback'}, format.to_sym

        case agent.status
        when :redirect
          # Keep the requested format and the original query string.
          redirect "/#{format}/#{agent.uri}#{inputs}"
        when :ok
          @headers['Content-Type'] = agent.content_type
          callback ? "#{callback}(#{agent.output})" : agent.output
        else
          @status = 500
          'Error'
        end
      end

      # Returns (and memoizes) the agent tied to this session. A stale
      # token — e.g. after a server restart — forces a fresh agent.
      # NOTE(review): SESSION_TOKEN is defined elsewhere in the project.
      def agent
        return @agent if @agent
        if @state[:agent].nil? || @state[:token] != SESSION_TOKEN
          @state[:token] = SESSION_TOKEN
          @state[:agent] = Scrappy::Agent.create.id
        end
        @agent = Scrappy::Agent[@state[:agent]]
      end
    end
  end
end
@@ -0,0 +1,70 @@
1
module Scrappy
  # Interactive (or scripted) command shell around a Scrappy agent.
  class Shell
    # file: optional script whose lines are executed one by one instead
    # of reading commands from the terminal.
    def initialize file=nil
      @agent = Agent.create
      @file = file
    end

    # Main loop: runs the script file if one was given, otherwise reads
    # commands interactively with Readline until 'quit'.
    def run
      commands = ['get', 'put', 'help']

      Readline.completion_append_character = " "
      Readline.completer_word_break_characters = ""
      Readline.completion_proc = proc { |line| commands.grep(/^#{Regexp.escape(line)}/).sort }

      if @file
        # File.foreach closes the file when iteration ends. The previous
        # open(@file, 'r').lines leaked the handle and, being Kernel#open,
        # would even spawn a subprocess for filenames starting with '|'.
        File.foreach(@file) do |line|
          break if process(line) == :quit
        end
      else
        begin
          line = Readline.readline(bash, true)
          # EOF (Ctrl-D) behaves like typing 'quit'.
          code = process line.nil? ? (puts 'quit' unless Options.quiet; 'quit') : line
        end while code != :quit
      end
    end

    protected
    # Parses and executes a single command line.
    # Returns :quit when the loop should stop; anything else continues.
    def process raw_command
      command = raw_command.strip

      code = if command =~ /\Aget\W(.*)\Z/
        puts @agent.proxy :get, $1
        puts ''
      elsif command == 'help'
        puts 'Available commands:'
        puts '  get URL: Visit the specified URL'
        puts '  help:    Show this information'
        puts '  quit:    Exit scrappy shell'
        puts ''
      elsif command == 'quit'
        :quit
      elsif command == '' or command[0..0] == '#'
        # Blank lines and comment lines are ignored.
        nil
      else
        puts "ERROR: Unknown command '#{command}'"
        puts ''
      end
      code
    end

    # Builds the prompt string, abbreviating long paths/queries of the
    # agent's current URI. Relies on the URI#base patch from support.rb.
    def bash
      return '' if Options.quiet
      location = if @agent.uri
        uri = URI::parse(@agent.uri)
        path = uri.path.to_s
        path = path[0..0] + "..." + path[-16..-1] if path.size > 20
        if uri.query
          query = "?" + uri.query
          query = "?..." + query[-10..-1] if query.size > 13
        else
          query = ""
        end
        "#{uri.base}#{path}#{query}"
      else
        ''
      end
      "#{location}$ "
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require 'open-uri'
2
+ require 'net/http'
3
+ require 'net/https'
4
+
5
module URI
  # Scheme plus authority portion of the URI, e.g.
  # "http://example.com/a/b" -> "http://example.com".
  # (URI::Generic includes URI, so parsed URIs gain this method.)
  def base
    to_s.split('/').first(3).join('/')
  end
end
10
+
11
module Scrappy
  # Mixin for Camping controllers: rebuilds an escaped query string from
  # the controller's @input hash.
  module InputEscaping
    # Returns "?k1=v1&k2=v2" with keys and values CGI-escaped, or ''
    # when there are no parameters.
    def inputs
      return '' if @input.empty?
      # Join pairs with '&': the original joined with '' which produced
      # an invalid query string whenever there was more than one
      # parameter. to_s guards against non-string values.
      "?" + @input.map { |k, v| "#{CGI.escape(k.to_s)}=#{CGI.escape(v.to_s)}" }.join('&')
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require 'gtk2'
2
module Gtk
  # Pre-declare the namespace so constants can be referenced before the
  # native extension (required below) defines the actual classes.
  module WebKit
  end
end
6
+
7
+ require 'rbwebkitgtk.so'
8
+
9
# Reopen the native WebView class to adjust its API surface.
class Gtk::WebKit::WebView
  # Keep the original binding reachable under an explicit name.
  alias :load_html_string_no_defaults :load_html_string
  # Currently forwards unchanged (base_uri defaults to nil); exists as a
  # hook point over the native load_html_string.
  def load_html_string(content, base_uri=nil)
    load_html_string_no_defaults(content, base_uri)
  end

  # Forwards to the native mark_text_matches_with_limit binding
  # (limit 0 presumably means "no limit" — confirm against rbwebkitgtk).
  # NOTE(review): the first parameter is likely meant to be "text".
  def mark_text_matches(test, case_sensitive=false, limit=0)
    mark_text_matches_with_limit(test, case_sensitive, limit)
  end
end
@@ -0,0 +1,3 @@
1
+ require 'stringio'
2
+ require 'test/unit'
3
+ require File.dirname(__FILE__) + '/../lib/scrappy'
@@ -0,0 +1,11 @@
1
+ require File.dirname(__FILE__) + '/test_helper.rb'
2
+
3
# Placeholder suite that keeps the test task green until real tests exist.
class TestScrappy < Test::Unit::TestCase

  # No fixtures needed yet.
  def setup
  end

  # Trivial sanity check so the suite always has at least one test.
  def test_truth
    assert true
  end
end
metadata ADDED
@@ -0,0 +1,233 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scrappy
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ version: "0.1"
9
+ platform: ruby
10
+ authors:
11
+ - Jose Ignacio
12
+ autorequire:
13
+ bindir: bin
14
+ cert_chain: []
15
+
16
+ date: 2010-10-07 00:00:00 +02:00
17
+ default_executable:
18
+ dependencies:
19
+ - !ruby/object:Gem::Dependency
20
+ name: activesupport
21
+ prerelease: false
22
+ requirement: &id001 !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ segments:
27
+ - 2
28
+ - 3
29
+ - 5
30
+ version: 2.3.5
31
+ type: :runtime
32
+ version_requirements: *id001
33
+ - !ruby/object:Gem::Dependency
34
+ name: markaby
35
+ prerelease: false
36
+ requirement: &id002 !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ segments:
41
+ - 0
42
+ - 7
43
+ - 1
44
+ version: 0.7.1
45
+ type: :runtime
46
+ version_requirements: *id002
47
+ - !ruby/object:Gem::Dependency
48
+ name: camping
49
+ prerelease: false
50
+ requirement: &id003 !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "="
53
+ - !ruby/object:Gem::Version
54
+ segments:
55
+ - 2
56
+ - 0
57
+ version: "2.0"
58
+ type: :runtime
59
+ version_requirements: *id003
60
+ - !ruby/object:Gem::Dependency
61
+ name: nokogiri
62
+ prerelease: false
63
+ requirement: &id004 !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ segments:
68
+ - 1
69
+ - 4
70
+ - 1
71
+ version: 1.4.1
72
+ type: :runtime
73
+ version_requirements: *id004
74
+ - !ruby/object:Gem::Dependency
75
+ name: mechanize
76
+ prerelease: false
77
+ requirement: &id005 !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ segments:
82
+ - 1
83
+ - 0
84
+ - 0
85
+ version: 1.0.0
86
+ type: :runtime
87
+ version_requirements: *id005
88
+ - !ruby/object:Gem::Dependency
89
+ name: lightrdf
90
+ prerelease: false
91
+ requirement: &id006 !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ segments:
96
+ - 0
97
+ - 1
98
+ version: "0.1"
99
+ type: :runtime
100
+ version_requirements: *id006
101
+ - !ruby/object:Gem::Dependency
102
+ name: rubyforge
103
+ prerelease: false
104
+ requirement: &id007 !ruby/object:Gem::Requirement
105
+ requirements:
106
+ - - ">="
107
+ - !ruby/object:Gem::Version
108
+ segments:
109
+ - 2
110
+ - 0
111
+ - 4
112
+ version: 2.0.4
113
+ type: :development
114
+ version_requirements: *id007
115
+ - !ruby/object:Gem::Dependency
116
+ name: hoe
117
+ prerelease: false
118
+ requirement: &id008 !ruby/object:Gem::Requirement
119
+ requirements:
120
+ - - ">="
121
+ - !ruby/object:Gem::Version
122
+ segments:
123
+ - 2
124
+ - 6
125
+ - 0
126
+ version: 2.6.0
127
+ type: :development
128
+ version_requirements: *id008
129
+ description: |-
130
+ Scrappy is a tool that allows extracting information from web pages and producing RDF data.
131
+ It uses the scraping ontology to define the mappings between HTML contents and RDF data.
132
+
133
+ An example of mapping is shown next, which allows extracting all titles from http://www.elmundo.es:
134
+
135
+ dc: http://purl.org/dc/elements/1.1/
136
+ rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
137
+ sioc: http://rdfs.org/sioc/ns#
138
+ sc: http://lab.gsi.dit.upm.es/scraping.rdf#
139
+ *:
140
+ rdf:type: sc:Fragment
141
+ sc:selector:
142
+ *:
143
+ rdf:type: sc:UriSelector
144
+ rdf:value: "http://www.elmundo.es/"
145
+ sc:identifier:
146
+ *:
147
+ rdf:type: sc:BaseUriSelector
148
+ sc:subfragment:
149
+ *:
150
+ sc:type: sioc:Post
151
+ sc:selector:
152
+ *:
153
+ rdf:type: sc:CssSelector
154
+ rdf:value: ".noticia h2, .noticia h3, .noticia h4"
155
+ sc:identifier:
156
+ *:
157
+ rdf:type: sc:CssSelector
158
+ rdf:value: "a"
159
+ sc:attribute: "href"
160
+ sc:subfragment:
161
+ *:
162
+ sc:type: rdf:Literal
163
+ sc:relation: dc:title
164
+ sc:selector:
165
+ *:
166
+ rdf:type: sc:CssSelector
167
+ rdf:value: "a"
168
+
169
+ (The above code is serialized using YARF format, supported by LightRDF gem, as well as
170
+ RDFXML, JSON, NTriples formats, which can also be used to define the mappings).
171
+ email:
172
+ - joseignacio.fernandez@gmail.com
173
+ executables:
174
+ - scrappy
175
+ extensions: []
176
+
177
+ extra_rdoc_files:
178
+ - History.txt
179
+ - Manifest.txt
180
+ files:
181
+ - History.txt
182
+ - Manifest.txt
183
+ - README.rdoc
184
+ - Rakefile
185
+ - bin/scrappy
186
+ - kb/elmundo.yarf
187
+ - lib/scrappy.rb
188
+ - lib/scrappy/agent/agent.rb
189
+ - lib/scrappy/agent/blind_agent.rb
190
+ - lib/scrappy/agent/cluster.rb
191
+ - lib/scrappy/agent/extractor.rb
192
+ - lib/scrappy/agent/visual_agent.rb
193
+ - lib/scrappy/proxy.rb
194
+ - lib/scrappy/server.rb
195
+ - lib/scrappy/shell.rb
196
+ - lib/scrappy/support.rb
197
+ - lib/scrappy/webkit/webkit.rb
198
+ - test/test_helper.rb
199
+ - test/test_scrappy.rb
200
+ has_rdoc: true
201
+ homepage: http://github.com/josei/scrappy
202
+ licenses: []
203
+
204
+ post_install_message: "**(Optional) Remember to install rbwebkitgtk for visual parsing features**"
205
+ rdoc_options:
206
+ - --main
207
+ - README.rdoc
208
+ require_paths:
209
+ - lib
210
+ required_ruby_version: !ruby/object:Gem::Requirement
211
+ requirements:
212
+ - - ">="
213
+ - !ruby/object:Gem::Version
214
+ segments:
215
+ - 0
216
+ version: "0"
217
+ required_rubygems_version: !ruby/object:Gem::Requirement
218
+ requirements:
219
+ - - ">="
220
+ - !ruby/object:Gem::Version
221
+ segments:
222
+ - 0
223
+ version: "0"
224
+ requirements: []
225
+
226
+ rubyforge_project: scrappy
227
+ rubygems_version: 1.3.6
228
+ signing_key:
229
+ specification_version: 3
230
+ summary: Web scraper that allows producing RDF data out of plain web pages
231
+ test_files:
232
+ - test/test_scrappy.rb
233
+ - test/test_helper.rb