scrappy 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +5 -0
- data/Manifest +13 -9
- data/README.rdoc +26 -26
- data/bin/scrappy +12 -9
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/agent/agent.rb +2 -1
- data/lib/scrappy/agent/visual_agent.rb +2 -1
- data/lib/scrappy/server/admin.rb +24 -0
- data/lib/scrappy/server/helpers.rb +23 -0
- data/lib/scrappy/server/server.rb +5 -13
- data/public/favicon.ico +0 -0
- data/{lib/scrappy/server/public → public}/images/logo.png +0 -0
- data/public/images/logo_tiny.png +0 -0
- data/{lib/js/annotator.js → public/javascripts/scrappy.js} +21 -0
- data/public/stylesheets/application.css +171 -0
- data/scrappy.gemspec +4 -4
- data/views/help.haml +20 -0
- data/views/home.haml +14 -0
- data/views/kb.haml +15 -0
- data/views/layout.haml +22 -0
- metadata +20 -21
- data/lib/scrappy/server/proxy.rb +0 -34
- data/lib/scrappy/server/public/images/logo_small.png +0 -0
- data/lib/scrappy/server/public/stylesheets/application.css +0 -51
- data/lib/scrappy/server/views/help.haml +0 -25
- data/lib/scrappy/server/views/home.haml +0 -26
data/History.txt
CHANGED
data/Manifest
CHANGED
@@ -4,16 +4,16 @@ README.rdoc
|
|
4
4
|
Rakefile
|
5
5
|
bin/scrappy
|
6
6
|
kb/elmundo.yarf
|
7
|
-
lib/js/annotator.js
|
8
7
|
lib/scrappy.rb
|
9
8
|
lib/scrappy/agent/agent.rb
|
10
9
|
lib/scrappy/agent/blind_agent.rb
|
11
10
|
lib/scrappy/agent/cache.rb
|
12
11
|
lib/scrappy/agent/dumper.rb
|
12
|
+
lib/scrappy/agent/extractor.rb
|
13
13
|
lib/scrappy/agent/formats.rb
|
14
14
|
lib/scrappy/agent/map_reduce.rb
|
15
|
-
lib/scrappy/agent/extractor.rb
|
16
15
|
lib/scrappy/agent/visual_agent.rb
|
16
|
+
lib/scrappy/repository.rb
|
17
17
|
lib/scrappy/selectors/base_uri.rb
|
18
18
|
lib/scrappy/selectors/css.rb
|
19
19
|
lib/scrappy/selectors/new_uri.rb
|
@@ -23,16 +23,20 @@ lib/scrappy/selectors/slice.rb
|
|
23
23
|
lib/scrappy/selectors/uri.rb
|
24
24
|
lib/scrappy/selectors/uri_pattern.rb
|
25
25
|
lib/scrappy/selectors/xpath.rb
|
26
|
-
lib/scrappy/server/proxy.rb
|
26
|
+
lib/scrappy/server/admin.rb
|
27
|
+
lib/scrappy/server/helpers.rb
|
27
28
|
lib/scrappy/server/server.rb
|
28
|
-
lib/scrappy/server/public/images/logo.png
|
29
|
-
lib/scrappy/server/public/images/logo_small.png
|
30
|
-
lib/scrappy/server/public/stylesheets/application.css
|
31
|
-
lib/scrappy/server/views/home.haml
|
32
|
-
lib/scrappy/server/views/help.haml
|
33
|
-
lib/scrappy/repository.rb
|
34
29
|
lib/scrappy/shell.rb
|
35
30
|
lib/scrappy/support.rb
|
36
31
|
lib/scrappy/webkit/webkit.rb
|
32
|
+
public/favicon.ico
|
33
|
+
public/images/logo.png
|
34
|
+
public/images/logo_tiny.png
|
35
|
+
public/javascripts/scrappy.js
|
36
|
+
public/stylesheets/application.css
|
37
37
|
test/test_helper.rb
|
38
38
|
test/test_scrappy.rb
|
39
|
+
views/help.haml
|
40
|
+
views/home.haml
|
41
|
+
views/kb.haml
|
42
|
+
views/layout.haml
|
data/README.rdoc
CHANGED
@@ -140,37 +140,37 @@ scrappy offers many different interfaces to get RDF data from a web page:
|
|
140
140
|
|
141
141
|
* RDF repository:
|
142
142
|
|
143
|
-
Sesame functionality has been included in Scrappy. You can configure
|
144
|
-
the repository options by editing the file config.yml placed the folder .scrappy, in your home dir.
|
145
|
-
An example of this file can be found at the end of this README.
|
143
|
+
Sesame functionality has been included in Scrappy. You can configure
|
144
|
+
the repository options by editing the file config.yml placed in the folder .scrappy, in your home dir.
|
145
|
+
An example of this file can be found at the end of this README.
|
146
146
|
|
147
|
-
You can get the data for a certain period of time, by using the time (-t, --time) option:
|
147
|
+
You can get the data for a certain period of time, by using the time (-t, --time) option:
|
148
148
|
|
149
|
-
|
149
|
+
$ scrappy -g example.org -t 3
|
150
150
|
|
151
|
-
This would get all the data stored in Sesame for example.org in the last 3 minutes.
|
151
|
+
This would get all the data stored in Sesame for example.org in the last 3 minutes.
|
152
152
|
|
153
153
|
* Sample config.yml
|
154
154
|
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
155
|
+
# This is a sample configuration file, with the options to communicate with Sesame using Scrappy
|
156
|
+
repository:
|
157
|
+
# The host where Sesame is. Do not add the trailing '/'
|
158
|
+
host: http://localhost
|
159
|
+
|
160
|
+
# The port for the connection
|
161
|
+
port: 8080
|
162
|
+
|
163
|
+
# The time to consider the data in the repository valid, in minutes
|
164
|
+
time: 15
|
165
|
+
|
166
|
+
# The name of the repository
|
167
|
+
repository: memory
|
168
|
+
|
169
|
+
# The format to communicate with the repository
|
170
|
+
format: ntriples
|
171
|
+
|
172
|
+
# You can use any of the following formats:
|
173
|
+
# rdfxml, ntriples, turtle, n3, trix, trig
|
174
174
|
|
175
175
|
|
176
176
|
== INSTALL:
|
@@ -222,4 +222,4 @@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
222
222
|
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
223
223
|
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
224
224
|
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
225
|
-
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
225
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/bin/scrappy
CHANGED
@@ -40,12 +40,12 @@ module Scrappy
|
|
40
40
|
opts.on('-u', '--debug') { Agent::Options.debug = true }
|
41
41
|
opts.on('-i', '--interactive') { Options.shell = true; Agent::Options.format_header = false }
|
42
42
|
opts.on('-s', '--server [BASE_URI]') { |uri| Options.server = true; Options.base_uri = uri }
|
43
|
-
opts.on('-
|
43
|
+
opts.on('-a', '--admin [BASE_URI]') { |uri| Options.admin = true; Options.base_uri = uri }
|
44
44
|
opts.on('-P P', '--port P') { |p| Options.port = p }
|
45
45
|
opts.on('-c C', '--concurrence C') { |c| Agent::Options.workers = c.to_i }
|
46
46
|
opts.on('-d D', '--delay D') { |d| Agent::Options.delay = d; Agent::Options.workers = 1 }
|
47
47
|
opts.on('-l L', '--levels L') { |l| Agent::Options.depth = l.to_i }
|
48
|
-
opts.on('-V', '--visual') { Agent::Options.agent = :visual
|
48
|
+
opts.on('-V', '--visual') { Agent::Options.agent = :visual }
|
49
49
|
opts.on('-r', '--reference') { Agent::Options.referenceable = :minimum }
|
50
50
|
opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
|
51
51
|
opts.on('-w', '--window') { Agent::Options.window = true }
|
@@ -63,16 +63,19 @@ module Scrappy
|
|
63
63
|
puts Agent.create.proxy(:http_method=>:get, :uri=>Options.uri).output
|
64
64
|
elsif Options.observe
|
65
65
|
Agent.create.observe(Options.observe)
|
66
|
-
elsif Options.
|
67
|
-
puts "Launching Scrappy Web
|
68
|
-
require 'scrappy/server/
|
66
|
+
elsif Options.admin
|
67
|
+
puts "Launching Scrappy Admin Web Server (browse http://localhost:#{Options.port})..."
|
68
|
+
require 'scrappy/server/server'
|
69
69
|
Thin::Logging.silent = true
|
70
|
-
Scrappy::
|
70
|
+
Scrappy::Server.register Scrappy::Admin
|
71
|
+
Scrappy::Server.run! :host => 'localhost', :port => Options.port, :environment => :production,
|
72
|
+
:base_uri=>Options.base_uri
|
71
73
|
elsif Options.server
|
72
|
-
puts "Launching Scrappy Web Server
|
74
|
+
puts "Launching Scrappy Web Server..."
|
73
75
|
require 'scrappy/server/server'
|
74
76
|
Thin::Logging.silent = true
|
75
|
-
Scrappy::Server.run! :host => 'localhost', :port => Options.port, :environment
|
77
|
+
Scrappy::Server.run! :host => 'localhost', :port => Options.port, :environment => :production,
|
78
|
+
:base_uri => Options.base_uri
|
76
79
|
elsif Options.shell
|
77
80
|
puts "Launching Scrappy Shell..."
|
78
81
|
require 'scrappy/shell'
|
@@ -123,7 +126,7 @@ Options
|
|
123
126
|
-w, --window Shows browser window (requires -v)
|
124
127
|
|
125
128
|
Authors
|
126
|
-
José Ignacio Fernández, Jacobo Blasco
|
129
|
+
José Ignacio Fernández, Alberto Mardomingo, Jacobo Blasco
|
127
130
|
|
128
131
|
Copyright
|
129
132
|
Copyright (c) 2010 José Ignacio Fernández. Licensed under the MIT License:
|
data/lib/scrappy.rb
CHANGED
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -24,6 +24,7 @@ module Scrappy
|
|
24
24
|
@window.show_all
|
25
25
|
@visible = true
|
26
26
|
end
|
27
|
+
@mechanize = Mechanize.new
|
27
28
|
end
|
28
29
|
|
29
30
|
def uri
|
@@ -33,7 +34,7 @@ module Scrappy
|
|
33
34
|
def uri= uri
|
34
35
|
# First, check if the requested uri is a valid HTML page
|
35
36
|
valid = begin
|
36
|
-
|
37
|
+
@mechanize.get(uri).is_a?(Mechanize::Page)
|
37
38
|
rescue
|
38
39
|
false
|
39
40
|
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Scrappy
|
2
|
+
module Admin
|
3
|
+
def self.registered app
|
4
|
+
app.get '/' do
|
5
|
+
if params[:format] and params[:uri]
|
6
|
+
redirect "#{settings.base_uri}/#{params[:format]}/#{simplify_uri(params[:uri])}"
|
7
|
+
else
|
8
|
+
haml :home
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
app.get '/help' do
|
13
|
+
haml :help
|
14
|
+
end
|
15
|
+
|
16
|
+
app.get '/kb' do
|
17
|
+
@uris = ( Agent::Options.kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')) +
|
18
|
+
Agent::Options.kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector')) ).
|
19
|
+
map { |node| node.rdf::value }.flatten.sort.map(&:to_s)
|
20
|
+
haml :kb
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Scrappy
|
2
|
+
module JavaScriptHelpers
|
3
|
+
def bookmark_js
|
4
|
+
"javascript:(function(){" +
|
5
|
+
"if(!document.getElementById('scrappy')){" +
|
6
|
+
"var e=document.createElement('script');" +
|
7
|
+
"e.src='https://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js';" +
|
8
|
+
"e.id='scrappy';" +
|
9
|
+
"document.getElementsByTagName('head')[0].appendChild(e);};" +
|
10
|
+
"if(!window.scrappy_loaded){" +
|
11
|
+
"e=document.createElement('script');" +
|
12
|
+
"e.src='http://localhost:3434/javascripts/scrappy.js?_=#{Time.now.to_i}';" +
|
13
|
+
"e.onerror=function(){alert('Error: Please start Scrappy Server at http://localhost:3434');};" +
|
14
|
+
"document.getElementsByTagName('head')[0].appendChild(e);" +
|
15
|
+
"}"+
|
16
|
+
"})();"
|
17
|
+
end
|
18
|
+
|
19
|
+
def drag_js
|
20
|
+
"alert(\"Don't click this. Drag it to your bookmarks\"); return false;"
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -1,26 +1,18 @@
|
|
1
1
|
require 'sinatra'
|
2
2
|
require 'thin'
|
3
3
|
require 'haml'
|
4
|
+
require 'scrappy/server/helpers'
|
5
|
+
require 'scrappy/server/admin'
|
4
6
|
|
5
7
|
module Scrappy
|
6
8
|
class Server < Sinatra::Base
|
9
|
+
helpers JavaScriptHelpers
|
10
|
+
|
7
11
|
enable :sessions
|
8
|
-
set :root, File.dirname(__FILE__)
|
12
|
+
set :root, File.join(File.dirname(__FILE__), '..', '..', '..')
|
9
13
|
set :views, Proc.new { File.join(root, "views") }
|
10
14
|
set :public, Proc.new { File.join(root, "public") }
|
11
15
|
|
12
|
-
get '/' do
|
13
|
-
if params[:format] and params[:uri]
|
14
|
-
redirect "#{settings.base_uri}/#{params[:format]}/#{simplify_uri(params[:uri])}"
|
15
|
-
else
|
16
|
-
haml :home
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
|
-
get '/help' do
|
21
|
-
haml :help
|
22
|
-
end
|
23
|
-
|
24
16
|
get '/:format/*' do |format, url|
|
25
17
|
process_request :get, format, url, params[:callback]
|
26
18
|
end
|
data/public/favicon.ico
ADDED
Binary file
|
File without changes
|
Binary file
|
@@ -1,3 +1,22 @@
|
|
1
|
+
add_visual_data = function() {
|
2
|
+
var items = document.documentElement.getElementsByTagName('*');
|
3
|
+
var i=0;
|
4
|
+
for(var i=0; i<items.length; i++) {
|
5
|
+
var item = items[i];
|
6
|
+
item.setAttribute('vx', item.offsetLeft);
|
7
|
+
item.setAttribute('vy', item.offsetTop);
|
8
|
+
item.setAttribute('vw', item.offsetWidth);
|
9
|
+
item.setAttribute('vh', item.offsetHeight);
|
10
|
+
item.setAttribute('vsize', document.defaultView.getComputedStyle(item, null).getPropertyValue('font-size'));
|
11
|
+
var weight = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-weight');
|
12
|
+
if (weight == 'normal') weight = 400;
|
13
|
+
if (weight == 'bold') weight = 700;
|
14
|
+
item.setAttribute('vweight', weight);
|
15
|
+
item.setAttribute('vcolor', document.defaultView.getComputedStyle(item, null).getPropertyValue('color'));
|
16
|
+
item.setAttribute('vbcolor', document.defaultView.getComputedStyle(item, null).getPropertyValue('background-color'));
|
17
|
+
}
|
18
|
+
}
|
19
|
+
|
1
20
|
$(document).ready(function(){
|
2
21
|
$("body").append("<div id='myTrees'></div>")
|
3
22
|
$("#page > *").bind('mouseover', function(e){
|
@@ -42,3 +61,5 @@ $(document).ready(function(){
|
|
42
61
|
myTrees.appendChild(li);
|
43
62
|
});
|
44
63
|
});
|
64
|
+
|
65
|
+
window.scrappy_loaded = true
|
@@ -0,0 +1,171 @@
|
|
1
|
+
body {
|
2
|
+
font-family: Arial, sans;
|
3
|
+
margin: 0;
|
4
|
+
}
|
5
|
+
pre {
|
6
|
+
width: 600px;
|
7
|
+
margin-left: auto;
|
8
|
+
margin-right: auto;
|
9
|
+
border: 1px solid;
|
10
|
+
padding: 10px;
|
11
|
+
}
|
12
|
+
a:link, a:visited {
|
13
|
+
color: #33f;
|
14
|
+
text-decoration: none;
|
15
|
+
}
|
16
|
+
a:hover, a:active {
|
17
|
+
color: #33f;
|
18
|
+
text-decoration: underline;
|
19
|
+
}
|
20
|
+
h1, h2, h3, h4 {
|
21
|
+
color: #777;
|
22
|
+
}
|
23
|
+
h2 {
|
24
|
+
font-weight: normal;
|
25
|
+
padding-bottom: 3px;
|
26
|
+
padding-top: 5px;
|
27
|
+
border-bottom: 1px solid #aaa;
|
28
|
+
}
|
29
|
+
img {
|
30
|
+
border: none;
|
31
|
+
}
|
32
|
+
|
33
|
+
|
34
|
+
#bar {
|
35
|
+
height: 33px;
|
36
|
+
font-size: 13px;
|
37
|
+
border-bottom: 1px solid;
|
38
|
+
background-color: #eee;
|
39
|
+
color: #ddd;
|
40
|
+
margin: 0;
|
41
|
+
padding: 0;
|
42
|
+
}
|
43
|
+
#bar ul {
|
44
|
+
display: block;
|
45
|
+
margin: 0 5px 0 5px;
|
46
|
+
padding: 0;
|
47
|
+
list-style-type: none;
|
48
|
+
}
|
49
|
+
#bar li {
|
50
|
+
display: inline-block;
|
51
|
+
margin: 0;
|
52
|
+
padding: 0;
|
53
|
+
}
|
54
|
+
#bar a {
|
55
|
+
position: relative;
|
56
|
+
margin-left: -4px;
|
57
|
+
height: 14px;
|
58
|
+
padding: 10px;
|
59
|
+
display: block;
|
60
|
+
}
|
61
|
+
#bar a:hover {
|
62
|
+
background-color: #ddd;
|
63
|
+
text-decoration: none;
|
64
|
+
}
|
65
|
+
#bar img {
|
66
|
+
position: relative;
|
67
|
+
top: -3px;
|
68
|
+
vertical-align: middle;
|
69
|
+
}
|
70
|
+
#bar ul.left {
|
71
|
+
float: left;
|
72
|
+
}
|
73
|
+
#bar ul.right {
|
74
|
+
float: right;
|
75
|
+
}
|
76
|
+
#column {
|
77
|
+
float:left;
|
78
|
+
margin-top: 20px;
|
79
|
+
height: auto;
|
80
|
+
width: 200px;
|
81
|
+
height: 600px;
|
82
|
+
border-right: 1px solid #ddd;
|
83
|
+
overflow-y: auto;
|
84
|
+
}
|
85
|
+
#body {
|
86
|
+
margin: auto; width: 800px; padding: 15px;
|
87
|
+
font-size: 14px; color: #333;
|
88
|
+
}
|
89
|
+
#center {
|
90
|
+
text-align: center;
|
91
|
+
margin-top: 100px;
|
92
|
+
margin-bottom: 100px;
|
93
|
+
}
|
94
|
+
#search {
|
95
|
+
margin-top: 40px;
|
96
|
+
font-size:20px;
|
97
|
+
margin-bottom: 10px;
|
98
|
+
}
|
99
|
+
#search input {
|
100
|
+
width: 700px; height:30px; font-size:16px;
|
101
|
+
}
|
102
|
+
#buttons {
|
103
|
+
width: 400px;
|
104
|
+
margin: auto;
|
105
|
+
}
|
106
|
+
#buttons select {
|
107
|
+
width: 100px; height: 30px; font-size:16px;
|
108
|
+
margin-left: 5px;
|
109
|
+
}
|
110
|
+
#buttons button {
|
111
|
+
width: 100px; height: 30px; font-size:16px;
|
112
|
+
margin-right: 5px;
|
113
|
+
}
|
114
|
+
#footer {
|
115
|
+
margin-top:30px; text-align: center; font-size:14px; color: #555;
|
116
|
+
height: 50px;
|
117
|
+
}
|
118
|
+
|
119
|
+
.bookmark,
|
120
|
+
a.bookmark:hover,
|
121
|
+
a.bookmark:visited,
|
122
|
+
a.bookmark:link,
|
123
|
+
a.bookmark:active {
|
124
|
+
font-size: 11px;
|
125
|
+
text-decoration: none;
|
126
|
+
border-radius: 7px;
|
127
|
+
padding: 6px;
|
128
|
+
background-color: #ccc;
|
129
|
+
background: -webkit-gradient(linear, left top, left bottom, from(#d7d7d7), to(#ababab));
|
130
|
+
background: -moz-linear-gradient(top, #d7d7d7, #ababab);
|
131
|
+
border-color: #c9c9c9;
|
132
|
+
border-bottom-color: #9a9a9a;
|
133
|
+
border-width: 1px;
|
134
|
+
border-style: solid;
|
135
|
+
margin: 5px;
|
136
|
+
color: #444;
|
137
|
+
-moz-border-radius: 7px;
|
138
|
+
}
|
139
|
+
|
140
|
+
ul.detail {
|
141
|
+
padding: 0;
|
142
|
+
list-style-type: none;
|
143
|
+
}
|
144
|
+
ul.detail li {
|
145
|
+
padding: 6px;
|
146
|
+
background-color: #eee;
|
147
|
+
margin: 1px;
|
148
|
+
}
|
149
|
+
ul.detail li span {
|
150
|
+
display: inline-block;
|
151
|
+
}
|
152
|
+
ul.detail li span.name {
|
153
|
+
width: 600px;
|
154
|
+
overflow-x: hidden;
|
155
|
+
font-family: monospace;
|
156
|
+
font-size: 12px;
|
157
|
+
}
|
158
|
+
ul.detail li span.format {
|
159
|
+
float: right;
|
160
|
+
font-size: 12px;
|
161
|
+
font-weight: bold;
|
162
|
+
margin-left: 10px;
|
163
|
+
text-align: center;
|
164
|
+
}
|
165
|
+
|
166
|
+
ul.detail li span.format a:hover,
|
167
|
+
ul.detail li span.format a:link,
|
168
|
+
ul.detail li span.format a:visited,
|
169
|
+
ul.detail li span.format a:active {
|
170
|
+
color: #900;
|
171
|
+
}
|
data/scrappy.gemspec
CHANGED
@@ -2,17 +2,17 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.2.0"
|
5
|
+
s.version = "0.2.1"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-03-
|
9
|
+
s.date = %q{2011-03-11}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
13
13
|
s.executables = ["scrappy"]
|
14
|
-
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/
|
15
|
-
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/
|
14
|
+
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/repository.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
+
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/repository.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "public/favicon.ico", "public/images/logo.png", "public/images/logo_tiny.png", "public/javascripts/scrappy.js", "public/stylesheets/application.css", "test/test_helper.rb", "test/test_scrappy.rb", "views/help.haml", "views/home.haml", "views/kb.haml", "views/layout.haml", "scrappy.gemspec"]
|
16
16
|
s.homepage = %q{http://github.com/josei/scrappy}
|
17
17
|
s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
|
18
18
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
|
data/views/help.haml
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
#body
|
2
|
+
%h1 Help
|
3
|
+
%p
|
4
|
+
Scrappy Web Server is a web interface to Scrappy functionalities.
|
5
|
+
You can use it to get data from a web resource and integrate it with other system.
|
6
|
+
%h2 URL format
|
7
|
+
%p
|
8
|
+
The service uses the following URL format:
|
9
|
+
%pre http://[host]/[format]/[url]
|
10
|
+
%p
|
11
|
+
For example, to retrieve http://example.com/~user/?test=1 with RDF serialization:
|
12
|
+
%pre==#{settings.base_uri || "http://localhost:#{settings.port}"}/rdf/http://example.com/~user/%3Ftest%3D1
|
13
|
+
%p Available serialization formats are rdf, rdfxml, png, yarf, ntriples, turtle, json, jsonrdf, and ejson.
|
14
|
+
%h2 Creating extractors
|
15
|
+
%p
|
16
|
+
Drag this to your bookmarks:
|
17
|
+
%a.bookmark{:href=>bookmark_js, :onclick=>drag_js} Scrappy
|
18
|
+
%p
|
19
|
+
Then visit the web page you want to build a extractor for.
|
20
|
+
Click on your "Scrappy" bookmark and annotate the web page.
|
data/views/home.haml
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#center
|
2
|
+
%a{:href=>'http://github.com/josei/scrappy'}
|
3
|
+
%img{:src=>"#{settings.base_uri}/images/logo.png"}
|
4
|
+
%form
|
5
|
+
#search
|
6
|
+
%input{:name=>:uri, :size=>30, :type=>:text, :value=>'Enter URI...', :onclick=>"if(this.value=='Enter URI...') this.value='';"}
|
7
|
+
#buttons
|
8
|
+
%button Scrape
|
9
|
+
%select{:name=>:format}
|
10
|
+
%option{:value=>:rdf} RDF
|
11
|
+
%option{:value=>:png} PNG
|
12
|
+
%option{:value=>:ejson} JSON
|
13
|
+
%option{:value=>:yarf} YARF
|
14
|
+
%option{:value=>:ntriples} nTriples
|
data/views/kb.haml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#body
|
2
|
+
%h1 Knowledge base
|
3
|
+
%p
|
4
|
+
%ul.detail
|
5
|
+
-@uris.each do |uri|
|
6
|
+
%li
|
7
|
+
%span.name
|
8
|
+
-if !uri.include?('*')
|
9
|
+
%a{:href=>uri}=uri
|
10
|
+
-else
|
11
|
+
=uri
|
12
|
+
-if !uri.include?('*')
|
13
|
+
-[['RDF', :rdf], ['JSON', :ejson], ['YARF', :yarf], ['PNG', :png]].reverse.each do |format, format_code|
|
14
|
+
%span.format
|
15
|
+
%a{:href=>"#{settings.base_uri}/#{format_code}/#{uri}"}=format
|
data/views/layout.haml
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
!!!
|
2
|
+
%html
|
3
|
+
%head
|
4
|
+
%title Scrappy
|
5
|
+
%link{:type=>"text/css", :href=>"#{settings.base_uri}/stylesheets/application.css", :rel=>"stylesheet"}
|
6
|
+
%body
|
7
|
+
#bar
|
8
|
+
-if request.fullpath!='/'
|
9
|
+
%ul.left
|
10
|
+
%li
|
11
|
+
%a{:href=>"#{settings.base_uri}/"}
|
12
|
+
%img{:src=>"#{settings.base_uri}/images/logo_tiny.png", :alt=>"Scrappy"}
|
13
|
+
%ul.right
|
14
|
+
%li
|
15
|
+
%a{:href=>"#{settings.base_uri}/kb"} Knowledge base
|
16
|
+
%li
|
17
|
+
%a{:href=>"#{settings.base_uri}/help"} Help
|
18
|
+
=yield
|
19
|
+
#footer
|
20
|
+
%a{:href=>"#{settings.base_uri}/"} Home
|
21
|
+
|
|
22
|
+
%a{:href=>'http://github.com/josei/scrappy'} About
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 2
|
8
|
-
- 0
|
9
|
-
version: 0.2.0
|
8
|
+
- 1
|
9
|
+
version: 0.2.1
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-03-
|
17
|
+
date: 2011-03-11 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -152,16 +152,16 @@ extensions: []
|
|
152
152
|
extra_rdoc_files:
|
153
153
|
- README.rdoc
|
154
154
|
- bin/scrappy
|
155
|
-
- lib/js/annotator.js
|
156
155
|
- lib/scrappy.rb
|
157
156
|
- lib/scrappy/agent/agent.rb
|
158
157
|
- lib/scrappy/agent/blind_agent.rb
|
159
158
|
- lib/scrappy/agent/cache.rb
|
160
159
|
- lib/scrappy/agent/dumper.rb
|
160
|
+
- lib/scrappy/agent/extractor.rb
|
161
161
|
- lib/scrappy/agent/formats.rb
|
162
162
|
- lib/scrappy/agent/map_reduce.rb
|
163
|
-
- lib/scrappy/agent/extractor.rb
|
164
163
|
- lib/scrappy/agent/visual_agent.rb
|
164
|
+
- lib/scrappy/repository.rb
|
165
165
|
- lib/scrappy/selectors/base_uri.rb
|
166
166
|
- lib/scrappy/selectors/css.rb
|
167
167
|
- lib/scrappy/selectors/new_uri.rb
|
@@ -171,14 +171,9 @@ extra_rdoc_files:
|
|
171
171
|
- lib/scrappy/selectors/uri.rb
|
172
172
|
- lib/scrappy/selectors/uri_pattern.rb
|
173
173
|
- lib/scrappy/selectors/xpath.rb
|
174
|
-
- lib/scrappy/server/proxy.rb
|
174
|
+
- lib/scrappy/server/admin.rb
|
175
|
+
- lib/scrappy/server/helpers.rb
|
175
176
|
- lib/scrappy/server/server.rb
|
176
|
-
- lib/scrappy/server/public/images/logo.png
|
177
|
-
- lib/scrappy/server/public/images/logo_small.png
|
178
|
-
- lib/scrappy/server/public/stylesheets/application.css
|
179
|
-
- lib/scrappy/server/views/home.haml
|
180
|
-
- lib/scrappy/server/views/help.haml
|
181
|
-
- lib/scrappy/repository.rb
|
182
177
|
- lib/scrappy/shell.rb
|
183
178
|
- lib/scrappy/support.rb
|
184
179
|
- lib/scrappy/webkit/webkit.rb
|
@@ -189,16 +184,16 @@ files:
|
|
189
184
|
- Rakefile
|
190
185
|
- bin/scrappy
|
191
186
|
- kb/elmundo.yarf
|
192
|
-
- lib/js/annotator.js
|
193
187
|
- lib/scrappy.rb
|
194
188
|
- lib/scrappy/agent/agent.rb
|
195
189
|
- lib/scrappy/agent/blind_agent.rb
|
196
190
|
- lib/scrappy/agent/cache.rb
|
197
191
|
- lib/scrappy/agent/dumper.rb
|
192
|
+
- lib/scrappy/agent/extractor.rb
|
198
193
|
- lib/scrappy/agent/formats.rb
|
199
194
|
- lib/scrappy/agent/map_reduce.rb
|
200
|
-
- lib/scrappy/agent/extractor.rb
|
201
195
|
- lib/scrappy/agent/visual_agent.rb
|
196
|
+
- lib/scrappy/repository.rb
|
202
197
|
- lib/scrappy/selectors/base_uri.rb
|
203
198
|
- lib/scrappy/selectors/css.rb
|
204
199
|
- lib/scrappy/selectors/new_uri.rb
|
@@ -208,19 +203,23 @@ files:
|
|
208
203
|
- lib/scrappy/selectors/uri.rb
|
209
204
|
- lib/scrappy/selectors/uri_pattern.rb
|
210
205
|
- lib/scrappy/selectors/xpath.rb
|
211
|
-
- lib/scrappy/server/proxy.rb
|
206
|
+
- lib/scrappy/server/admin.rb
|
207
|
+
- lib/scrappy/server/helpers.rb
|
212
208
|
- lib/scrappy/server/server.rb
|
213
|
-
- lib/scrappy/server/public/images/logo.png
|
214
|
-
- lib/scrappy/server/public/images/logo_small.png
|
215
|
-
- lib/scrappy/server/public/stylesheets/application.css
|
216
|
-
- lib/scrappy/server/views/home.haml
|
217
|
-
- lib/scrappy/server/views/help.haml
|
218
|
-
- lib/scrappy/repository.rb
|
219
209
|
- lib/scrappy/shell.rb
|
220
210
|
- lib/scrappy/support.rb
|
221
211
|
- lib/scrappy/webkit/webkit.rb
|
212
|
+
- public/favicon.ico
|
213
|
+
- public/images/logo.png
|
214
|
+
- public/images/logo_tiny.png
|
215
|
+
- public/javascripts/scrappy.js
|
216
|
+
- public/stylesheets/application.css
|
222
217
|
- test/test_helper.rb
|
223
218
|
- test/test_scrappy.rb
|
219
|
+
- views/help.haml
|
220
|
+
- views/home.haml
|
221
|
+
- views/kb.haml
|
222
|
+
- views/layout.haml
|
224
223
|
- scrappy.gemspec
|
225
224
|
has_rdoc: true
|
226
225
|
homepage: http://github.com/josei/scrappy
|
data/lib/scrappy/server/proxy.rb
DELETED
@@ -1,34 +0,0 @@
|
|
1
|
-
require 'sinatra'
|
2
|
-
require 'thin'
|
3
|
-
|
4
|
-
module Scrappy
|
5
|
-
class Proxy < Sinatra::Base
|
6
|
-
get '*' do
|
7
|
-
process_request :get
|
8
|
-
end
|
9
|
-
|
10
|
-
post '*' do
|
11
|
-
process_request :post
|
12
|
-
end
|
13
|
-
|
14
|
-
protected
|
15
|
-
def process_request method
|
16
|
-
response = agent.proxy :method=>method, :uri=>request.env['REQUEST_URI'], :inputs=>params
|
17
|
-
|
18
|
-
case response.status
|
19
|
-
when :redirect
|
20
|
-
redirect response.uri
|
21
|
-
when :ok
|
22
|
-
headers 'Content-Type' => response.content_type
|
23
|
-
response.output
|
24
|
-
else
|
25
|
-
status 500
|
26
|
-
"Internal error"
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
def agent
|
31
|
-
Scrappy::Agent[request.ip]
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
Binary file
|
@@ -1,51 +0,0 @@
|
|
1
|
-
body {
|
2
|
-
font-family: Arial, sans;
|
3
|
-
}
|
4
|
-
#center {
|
5
|
-
text-align: center;
|
6
|
-
margin-top: 100px;
|
7
|
-
margin-bottom: 100px;
|
8
|
-
}
|
9
|
-
#search {
|
10
|
-
margin-top: 40px;
|
11
|
-
font-size:20px;
|
12
|
-
margin-bottom: 10px;
|
13
|
-
}
|
14
|
-
#search input {
|
15
|
-
width: 700px; height:30px; font-size:16px;
|
16
|
-
}
|
17
|
-
#buttons {
|
18
|
-
width: 400px;
|
19
|
-
margin: auto;
|
20
|
-
}
|
21
|
-
#buttons select {
|
22
|
-
width: 100px; height: 30px; font-size:16px;
|
23
|
-
margin-left: 5px;
|
24
|
-
}
|
25
|
-
#buttons button {
|
26
|
-
width: 100px; height: 30px; font-size:16px;
|
27
|
-
margin-right: 5px;
|
28
|
-
}
|
29
|
-
|
30
|
-
pre {
|
31
|
-
width: 600px;
|
32
|
-
margin-left: auto;
|
33
|
-
margin-right: auto;
|
34
|
-
border: 1px solid;
|
35
|
-
padding:10px;
|
36
|
-
}
|
37
|
-
#header {
|
38
|
-
margin: auto; width: 800px; padding: 15px;
|
39
|
-
border-bottom: 1px solid;
|
40
|
-
margin-top: 20px; font-size: 14px; color: #555;
|
41
|
-
}
|
42
|
-
#body {
|
43
|
-
margin: auto; width: 800px; padding: 15px;
|
44
|
-
font-size: 14px; color: #555;
|
45
|
-
}
|
46
|
-
#footer {
|
47
|
-
margin-top:30px; text-align: center; font-size:14px; color: #555;
|
48
|
-
}
|
49
|
-
img {
|
50
|
-
border: none;
|
51
|
-
}
|
@@ -1,25 +0,0 @@
|
|
1
|
-
!!!
|
2
|
-
%html
|
3
|
-
%head
|
4
|
-
%title Help - Scrappy
|
5
|
-
%link{:type=>"text/css", :href=>"#{settings.base_uri}/stylesheets/application.css", :rel=>"stylesheet"}
|
6
|
-
%body
|
7
|
-
%div#header
|
8
|
-
%img{:src=>"#{settings.base_uri}/images/logo_small.png"}
|
9
|
-
%div#body
|
10
|
-
%h1 Help
|
11
|
-
%p
|
12
|
-
Scrappy web server is a web interface to Scrappy functionalities.
|
13
|
-
You can use it to get data from a web resource and integrate it with other system.
|
14
|
-
The service uses the following URL format:
|
15
|
-
%pre http://[host]/[format]/[url]
|
16
|
-
%p
|
17
|
-
For example, to retrieve http://example.com/~user/?test=1 with RDF serialization:
|
18
|
-
%pre==#{settings.base_uri || "http://localhost:#{settings.port}"}/rdf/http://example.com/~user/%3Ftest%3D1
|
19
|
-
%p Available serialization formats are rdf, rdfxml, png, yarf, ntriples, turtle, json, jsonrdf, and ejson.
|
20
|
-
%div#footer
|
21
|
-
%a{:href=>"#{settings.base_uri}/"} Home
|
22
|
-
|
|
23
|
-
%a{:href=>"#{settings.base_uri}/help"} Help
|
24
|
-
|
|
25
|
-
%a{:href=>'http://github.com/josei/scrappy'} About
|
@@ -1,26 +0,0 @@
|
|
1
|
-
!!!
|
2
|
-
%html
|
3
|
-
%head
|
4
|
-
%title Scrappy
|
5
|
-
%link{:type=>"text/css", :href=>"#{settings.base_uri}/stylesheets/application.css", :rel=>"stylesheet"}
|
6
|
-
%body
|
7
|
-
%div#center
|
8
|
-
%a{:href=>'http://github.com/josei/scrappy'}
|
9
|
-
%img{:src=>"#{settings.base_uri}/images/logo.png"}
|
10
|
-
%form
|
11
|
-
%div#search
|
12
|
-
%input{:name=>:uri, :size=>30, :type=>:text, :value=>'Enter URI...', :onclick=>"if(this.value=='Enter URI...') this.value='';"}
|
13
|
-
%div#buttons
|
14
|
-
%button Scrape
|
15
|
-
%select{:name=>:format}
|
16
|
-
%option{:value=>:rdf} RDF
|
17
|
-
%option{:value=>:png} PNG
|
18
|
-
%option{:value=>:ejson} JSON
|
19
|
-
%option{:value=>:yarf} YARF
|
20
|
-
%option{:value=>:ntriples} nTriples
|
21
|
-
%div#footer
|
22
|
-
%a{:href=>"#{settings.base_uri}/"} Home
|
23
|
-
|
|
24
|
-
%a{:href=>"#{settings.base_uri}/help"} Help
|
25
|
-
|
|
26
|
-
%a{:href=>'http://github.com/josei/scrappy'} About
|