scrappy 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lib/scrappy/proxy.rb ADDED
@@ -0,0 +1,41 @@
+ require 'camping'
+ require 'camping/session'
+ require 'open3'
+
+ Camping.goes :Scrappy
+
+ module Scrappy
+   module Controllers
+     class Index < R '.*'
+       include InputEscaping
+
+       def get
+         process_request :get
+       end
+
+       def post
+         process_request :post
+       end
+
+       protected
+       def process_request http_method
+         agent.proxy http_method, request.env["REQUEST_URI"], @input
+
+         case agent.status
+         when :redirect
+           redirect agent.uri
+         when :ok
+           @headers['Content-Type'] = agent.content_type
+           agent.output
+         else
+           @status = 500
+           'Error'
+         end
+       end
+
+       def agent
+         Scrappy::Agent[@request.env["REMOTE_ADDR"]]
+       end
+     end
+   end
+ end
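
The controller above matches every path and replays the incoming request (method, REQUEST_URI and parameters) through a per-client Scrappy::Agent, so the app behaves as an HTTP proxy. A hedged sketch of pointing a Ruby client at it, not part of the package; the host and port are assumptions, since how this app is mounted is decided by bin/scrappy, which is not shown in this diff:

    require 'net/http'

    # Hypothetical host/port for the proxy app above.
    scrappy_proxy = Net::HTTP::Proxy('localhost', 3030)
    response = scrappy_proxy.get_response(URI('http://www.example.com/'))
    puts response['Content-Type']  # set from agent.content_type by the controller
    puts response.body             # agent.output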
lib/scrappy/server.rb ADDED
@@ -0,0 +1,77 @@
+ require 'camping'
+ require 'camping/session'
+ require 'open3'
+
+ Camping.goes :Scrappy
+
+ module Scrappy
+   include Camping::Session
+   secret '1a36591bceec49c832079e270d7e8b73'
+
+   module Controllers
+     class Index
+       def get
+         mab do
+           html do
+             head {}
+             body do
+               h1 "Scrappy Web Server"
+               p "Use following URL format: http://[host]/[format]/[url]"
+               p do
+                 "For example: " + a("http://localhost:3434/rdfxml/http://www.google.com",
+                                     :href=>"http://localhost:3434/rdfxml/http://www.google.com")
+               end
+               p do
+                 "Remember to escape parameters: " +
+                 "http://www.example.com/~user/%3Ftest%3D1%26test1%3D2<br/> or<br/> " +
+                 "http%3A%2F%2Fwww.example.com%2F~user%2F%3Ftest%3D1%26test1%3D2<br/>" +
+                 "instead of<br/> http://www.example.com/~user/?test=1&test1=2"
+               end
+               p do
+                 "Available formats are png, yarf, rdfxml, ntriples, turtle, json, jsonrdf, ejson"
+               end
+             end
+           end
+         end
+       end
+     end
+
+     class Extract < R '/(\w+)/(.+)'
+       include InputEscaping
+
+       def get format, url
+         process_request :get, format, url
+       end
+
+       def post format, url
+         process_request :post, format, url
+       end
+
+       protected
+       def process_request http_method, format, url
+         callback = @input['callback']
+         agent.proxy http_method, url, @input.reject{|k,v| k=='callback'}, format.to_sym
+
+         case agent.status
+         when :redirect
+           redirect "/#{format}/#{agent.uri}#{inputs}"
+         when :ok
+           @headers['Content-Type'] = agent.content_type
+           callback ? "#{callback}(#{agent.output})" : agent.output
+         else
+           @status = 500
+           'Error'
+         end
+       end
+
+       def agent
+         return @agent if @agent
+         if @state[:agent].nil? || @state[:token] != SESSION_TOKEN
+           @state[:token] = SESSION_TOKEN
+           @state[:agent] = Scrappy::Agent.create.id
+         end
+         @agent = Scrappy::Agent[@state[:agent]]
+       end
+     end
+   end
+ end
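
A hedged usage sketch for the web server above, not part of the package: it follows the http://[host]/[format]/[url] scheme and the escaping advice printed by the Index controller, and assumes the server is listening on localhost:3434 as in the example link, with rdfxml as one of the listed formats:

    require 'open-uri'
    require 'cgi'

    # Escape the target URL before embedding it in the path, as the index page advises.
    target = CGI.escape('http://www.example.com/~user/?test=1&test1=2')
    puts URI.parse("http://localhost:3434/rdfxml/#{target}").read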
lib/scrappy/shell.rb ADDED
@@ -0,0 +1,70 @@
+ module Scrappy
+   class Shell
+     def initialize file=nil
+       @agent = Agent.create
+       @file = file
+     end
+
+     def run
+       commands = ['get', 'put', 'help']
+
+       Readline.completion_append_character = " "
+       Readline.completer_word_break_characters = ""
+       Readline.completion_proc = proc { |line| commands.grep(/^#{Regexp.escape(line)}/).sort }
+
+       if @file
+         open(@file, 'r').lines.each do |line|
+           break if process(line) == :quit
+         end
+       else
+         begin
+           line = Readline.readline(bash, true)
+           code = process line.nil? ? (puts 'quit' unless Options.quiet; 'quit') : line
+         end while code != :quit
+       end
+     end
+
+     protected
+     def process raw_command
+       command = raw_command.strip
+
+       code = if command =~ /\Aget\W(.*)\Z/
+         puts @agent.proxy :get, $1
+         puts ''
+       elsif command == 'help'
+         puts 'Available commands:'
+         puts ' get URL: Visit the specified URL'
+         puts ' help: Show this information'
+         puts ' quit: Exit scrappy shell'
+         puts ''
+       elsif command == 'quit'
+         :quit
+       elsif command == '' or command[0..0] == '#'
+         nil
+       else
+         puts "ERROR: Unknown command '#{command}'"
+         puts ''
+       end
+       code
+     end
+
+     def bash
+       return '' if Options.quiet
+       location = if @agent.uri
+         uri = URI::parse(@agent.uri)
+         path = uri.path.to_s
+         path = path[0..0] + "..." + path[-16..-1] if path.size > 20
+         if uri.query
+           query = "?" + uri.query
+           query = "?..." + query[-10..-1] if query.size > 13
+         else
+           query = ""
+         end
+         "#{uri.base}#{path}#{query}"
+       else
+         ''
+       end
+       "#{location}$ "
+     end
+   end
+ end
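
A hedged sketch of driving the shell above non-interactively, not part of the package: when a file is given, Shell#run feeds each of its lines through Shell#process instead of Readline, so the commands listed in the help text can be scripted. It assumes lib/scrappy (not shown in this diff) loads Readline and defines Scrappy::Agent and Options as the shell expects; the script file name is hypothetical.

    require 'scrappy'

    # commands.scrappy (hypothetical) might contain:
    #   # lines starting with '#' and blank lines are ignored by Shell#process
    #   get http://www.example.com/
    #   quit
    Scrappy::Shell.new('commands.scrappy').run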
lib/scrappy/support.rb ADDED
@@ -0,0 +1,18 @@
+ require 'open-uri'
+ require 'net/http'
+ require 'net/https'
+
+ module URI
+   def base
+     self.to_s.split('/')[0..2] * '/'
+   end
+ end
+
+ module Scrappy
+   module InputEscaping
+     def inputs
+       return '' if @input.empty?
+       "?" + (@input.map{|k,v| "#{CGI.escape(k)}=#{CGI.escape(v)}"}*'')
+     end
+   end
+ end
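
The URI#base helper above keeps only the scheme and host of a URL by splitting on '/' and rejoining the first three segments; a quick illustration of that expression (a sketch, not part of the package):

    # Same expression as in URI#base, applied to a plain string.
    'http://www.example.com/~user/?test=1&test1=2'.split('/')[0..2] * '/'
    # => "http://www.example.com"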
lib/scrappy/webkit/webkit.rb ADDED
@@ -0,0 +1,18 @@
+ require 'gtk2'
+ module Gtk
+   module WebKit
+   end
+ end
+
+ require 'rbwebkitgtk.so'
+
+ class Gtk::WebKit::WebView
+   alias :load_html_string_no_defaults :load_html_string
+   def load_html_string(content, base_uri=nil)
+     load_html_string_no_defaults(content, base_uri)
+   end
+
+   def mark_text_matches(test, case_sensitive=false, limit=0)
+     mark_text_matches_with_limit(test, case_sensitive, limit)
+   end
+ end
test/test_helper.rb ADDED
@@ -0,0 +1,3 @@
+ require 'stringio'
+ require 'test/unit'
+ require File.dirname(__FILE__) + '/../lib/scrappy'
test/test_scrappy.rb ADDED
@@ -0,0 +1,11 @@
+ require File.dirname(__FILE__) + '/test_helper.rb'
+
+ class TestScrappy < Test::Unit::TestCase
+
+   def setup
+   end
+
+   def test_truth
+     assert true
+   end
+ end
metadata ADDED
@@ -0,0 +1,233 @@
+ --- !ruby/object:Gem::Specification
+ name: scrappy
+ version: !ruby/object:Gem::Version
+   prerelease: false
+   segments:
+   - 0
+   - 1
+   version: "0.1"
+ platform: ruby
+ authors:
+ - Jose Ignacio
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2010-10-07 00:00:00 +02:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: activesupport
+   prerelease: false
+   requirement: &id001 !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         segments:
+         - 2
+         - 3
+         - 5
+         version: 2.3.5
+   type: :runtime
+   version_requirements: *id001
+ - !ruby/object:Gem::Dependency
+   name: markaby
+   prerelease: false
+   requirement: &id002 !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         segments:
+         - 0
+         - 7
+         - 1
+         version: 0.7.1
+   type: :runtime
+   version_requirements: *id002
+ - !ruby/object:Gem::Dependency
+   name: camping
+   prerelease: false
+   requirement: &id003 !ruby/object:Gem::Requirement
+     requirements:
+     - - "="
+       - !ruby/object:Gem::Version
+         segments:
+         - 2
+         - 0
+         version: "2.0"
+   type: :runtime
+   version_requirements: *id003
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   prerelease: false
+   requirement: &id004 !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         segments:
+         - 1
+         - 4
+         - 1
+         version: 1.4.1
+   type: :runtime
+   version_requirements: *id004
+ - !ruby/object:Gem::Dependency
+   name: mechanize
+   prerelease: false
+   requirement: &id005 !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         segments:
+         - 1
+         - 0
+         - 0
+         version: 1.0.0
+   type: :runtime
+   version_requirements: *id005
+ - !ruby/object:Gem::Dependency
+   name: lightrdf
+   prerelease: false
+   requirement: &id006 !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         segments:
+         - 0
+         - 1
+         version: "0.1"
+   type: :runtime
+   version_requirements: *id006
+ - !ruby/object:Gem::Dependency
+   name: rubyforge
+   prerelease: false
+   requirement: &id007 !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         segments:
+         - 2
+         - 0
+         - 4
+         version: 2.0.4
+   type: :development
+   version_requirements: *id007
+ - !ruby/object:Gem::Dependency
+   name: hoe
+   prerelease: false
+   requirement: &id008 !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         segments:
+         - 2
+         - 6
+         - 0
+         version: 2.6.0
+   type: :development
+   version_requirements: *id008
+ description: |-
+   Scrappy is a tool that allows extracting information from web pages and producing RDF data.
+   It uses the scraping ontology to define the mappings between HTML contents and RDF data.
+
+   An example of mapping is shown next, which allows extracting all titles from http://www.elmundo.es:
+
+   dc: http://purl.org/dc/elements/1.1/
+   rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
+   sioc: http://rdfs.org/sioc/ns#
+   sc: http://lab.gsi.dit.upm.es/scraping.rdf#
+   *:
+     rdf:type: sc:Fragment
+     sc:selector:
+       *:
+         rdf:type: sc:UriSelector
+         rdf:value: "http://www.elmundo.es/"
+     sc:identifier:
+       *:
+         rdf:type: sc:BaseUriSelector
+     sc:subfragment:
+       *:
+         sc:type: sioc:Post
+         sc:selector:
+           *:
+             rdf:type: sc:CssSelector
+             rdf:value: ".noticia h2, .noticia h3, .noticia h4"
+         sc:identifier:
+           *:
+             rdf:type: sc:CssSelector
+             rdf:value: "a"
+             sc:attribute: "href"
+         sc:subfragment:
+           *:
+             sc:type: rdf:Literal
+             sc:relation: dc:title
+             sc:selector:
+               *:
+                 rdf:type: sc:CssSelector
+                 rdf:value: "a"
+
+   (The above code is serialized using YARF format, supported by LightRDF gem, as well as
+   RDFXML, JSON, NTriples formats, which can also be used to define the mappings).
+ email:
+ - joseignacio.fernandez@gmail.com
+ executables:
+ - scrappy
+ extensions: []
+
+ extra_rdoc_files:
+ - History.txt
+ - Manifest.txt
+ files:
+ - History.txt
+ - Manifest.txt
+ - README.rdoc
+ - Rakefile
+ - bin/scrappy
+ - kb/elmundo.yarf
+ - lib/scrappy.rb
+ - lib/scrappy/agent/agent.rb
+ - lib/scrappy/agent/blind_agent.rb
+ - lib/scrappy/agent/cluster.rb
+ - lib/scrappy/agent/extractor.rb
+ - lib/scrappy/agent/visual_agent.rb
+ - lib/scrappy/proxy.rb
+ - lib/scrappy/server.rb
+ - lib/scrappy/shell.rb
+ - lib/scrappy/support.rb
+ - lib/scrappy/webkit/webkit.rb
+ - test/test_helper.rb
+ - test/test_scrappy.rb
+ has_rdoc: true
+ homepage: http://github.com/josei/scrappy
+ licenses: []
+
+ post_install_message: "**(Optional) Remember to install rbwebkitgtk for visual parsing features**"
+ rdoc_options:
+ - --main
+ - README.rdoc
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       segments:
+       - 0
+       version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       segments:
+       - 0
+       version: "0"
+ requirements: []
+
+ rubyforge_project: scrappy
+ rubygems_version: 1.3.6
+ signing_key:
+ specification_version: 3
+ summary: Web scraper that allows producing RDF data out of plain web pages
+ test_files:
+ - test/test_scrappy.rb
+ - test/test_helper.rb
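
The mapping in the gem description above drives extraction with CSS selectors: each ".noticia h2, .noticia h3, .noticia h4" node becomes a sioc:Post identified by its link's href, with the link text as its dc:title. A hedged, illustrative sketch of what those selectors match, using Nokogiri (a declared dependency) directly rather than Scrappy's own extractor:

    require 'nokogiri'
    require 'open-uri'

    # The sc:CssSelector values from the mapping above, applied by hand.
    doc = Nokogiri::HTML(URI.parse('http://www.elmundo.es/').read)
    doc.css('.noticia h2, .noticia h3, .noticia h4').each do |headline|
      link = headline.at_css('a')   # identifier/title selector: "a"
      next unless link
      puts "#{link['href']}  dc:title  #{link.text.strip}"
    end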