scrappy 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -0
- data/Manifest +13 -9
- data/README.rdoc +26 -26
- data/bin/scrappy +12 -9
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/agent/agent.rb +2 -1
- data/lib/scrappy/agent/visual_agent.rb +2 -1
- data/lib/scrappy/server/admin.rb +24 -0
- data/lib/scrappy/server/helpers.rb +23 -0
- data/lib/scrappy/server/server.rb +5 -13
- data/public/favicon.ico +0 -0
- data/{lib/scrappy/server/public → public}/images/logo.png +0 -0
- data/public/images/logo_tiny.png +0 -0
- data/{lib/js/annotator.js → public/javascripts/scrappy.js} +21 -0
- data/public/stylesheets/application.css +171 -0
- data/scrappy.gemspec +4 -4
- data/views/help.haml +20 -0
- data/views/home.haml +14 -0
- data/views/kb.haml +15 -0
- data/views/layout.haml +22 -0
- metadata +20 -21
- data/lib/scrappy/server/proxy.rb +0 -34
- data/lib/scrappy/server/public/images/logo_small.png +0 -0
- data/lib/scrappy/server/public/stylesheets/application.css +0 -51
- data/lib/scrappy/server/views/help.haml +0 -25
- data/lib/scrappy/server/views/home.haml +0 -26
data/History.txt
CHANGED
data/Manifest
CHANGED
@@ -4,16 +4,16 @@ README.rdoc
|
|
4
4
|
Rakefile
|
5
5
|
bin/scrappy
|
6
6
|
kb/elmundo.yarf
|
7
|
-
lib/js/annotator.js
|
8
7
|
lib/scrappy.rb
|
9
8
|
lib/scrappy/agent/agent.rb
|
10
9
|
lib/scrappy/agent/blind_agent.rb
|
11
10
|
lib/scrappy/agent/cache.rb
|
12
11
|
lib/scrappy/agent/dumper.rb
|
12
|
+
lib/scrappy/agent/extractor.rb
|
13
13
|
lib/scrappy/agent/formats.rb
|
14
14
|
lib/scrappy/agent/map_reduce.rb
|
15
|
-
lib/scrappy/agent/extractor.rb
|
16
15
|
lib/scrappy/agent/visual_agent.rb
|
16
|
+
lib/scrappy/repository.rb
|
17
17
|
lib/scrappy/selectors/base_uri.rb
|
18
18
|
lib/scrappy/selectors/css.rb
|
19
19
|
lib/scrappy/selectors/new_uri.rb
|
@@ -23,16 +23,20 @@ lib/scrappy/selectors/slice.rb
|
|
23
23
|
lib/scrappy/selectors/uri.rb
|
24
24
|
lib/scrappy/selectors/uri_pattern.rb
|
25
25
|
lib/scrappy/selectors/xpath.rb
|
26
|
-
lib/scrappy/server/
|
26
|
+
lib/scrappy/server/admin.rb
|
27
|
+
lib/scrappy/server/helpers.rb
|
27
28
|
lib/scrappy/server/server.rb
|
28
|
-
lib/scrappy/server/public/images/logo.png
|
29
|
-
lib/scrappy/server/public/images/logo_small.png
|
30
|
-
lib/scrappy/server/public/stylesheets/application.css
|
31
|
-
lib/scrappy/server/views/home.haml
|
32
|
-
lib/scrappy/server/views/help.haml
|
33
|
-
lib/scrappy/repository.rb
|
34
29
|
lib/scrappy/shell.rb
|
35
30
|
lib/scrappy/support.rb
|
36
31
|
lib/scrappy/webkit/webkit.rb
|
32
|
+
public/favicon.ico
|
33
|
+
public/images/logo.png
|
34
|
+
public/images/logo_tiny.png
|
35
|
+
public/javascripts/scrappy.js
|
36
|
+
public/stylesheets/application.css
|
37
37
|
test/test_helper.rb
|
38
38
|
test/test_scrappy.rb
|
39
|
+
views/help.haml
|
40
|
+
views/home.haml
|
41
|
+
views/kb.haml
|
42
|
+
views/layout.haml
|
data/README.rdoc
CHANGED
@@ -140,37 +140,37 @@ scrappy offers many different interfaces to get RDF data from a web page:
|
|
140
140
|
|
141
141
|
* RDF repository:
|
142
142
|
|
143
|
-
Sesame functionality has been included in Scrappy. You can configure
|
144
|
-
the repository options by editing the file config.yml placed the folder .scrappy, in your home dir.
|
145
|
-
An example of this file can be found at the end of this README.
|
143
|
+
Sesame functionality has been included in Scrappy. You can configure
|
144
|
+
the repository options by editing the file config.yml placed in the folder .scrappy, in your home dir.
|
145
|
+
An example of this file can be found at the end of this README.
|
146
146
|
|
147
|
-
You can get the data for a certain period of time, by using the time (-t, --time) option:
|
147
|
+
You can get the data for a certain period of time, by using the time (-t, --time) option:
|
148
148
|
|
149
|
-
|
149
|
+
$ scrappy -g example.org -t 3
|
150
150
|
|
151
|
-
This would get all the data stored in Sesame for example.org in the last 3 minutes.
|
151
|
+
This would get all the data stored in Sesame for example.org in the last 3 minutes.
|
152
152
|
|
153
153
|
* Sample config.yml
|
154
154
|
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
155
|
+
# This is a sample configuration file, with the options to communicate with Sesame using Scrappy
|
156
|
+
repository:
|
157
|
+
# The host where Sesame is. Do not add the trailing '/'
|
158
|
+
host: http://localhost
|
159
|
+
|
160
|
+
# The port for the connection
|
161
|
+
port: 8080
|
162
|
+
|
163
|
+
# The time to consider the data in the repository valid, in minutes
|
164
|
+
time: 15
|
165
|
+
|
166
|
+
# The name of the repository
|
167
|
+
repository: memory
|
168
|
+
|
169
|
+
# The format to communicate with the repository
|
170
|
+
format: ntriples
|
171
|
+
|
172
|
+
# You can use any of the following formats:
|
173
|
+
# rdfxml, ntriples, turtle, n3, trix, trig
|
174
174
|
|
175
175
|
|
176
176
|
== INSTALL:
|
@@ -222,4 +222,4 @@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
222
222
|
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
223
223
|
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
224
224
|
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
225
|
-
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
225
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/bin/scrappy
CHANGED
@@ -40,12 +40,12 @@ module Scrappy
|
|
40
40
|
opts.on('-u', '--debug') { Agent::Options.debug = true }
|
41
41
|
opts.on('-i', '--interactive') { Options.shell = true; Agent::Options.format_header = false }
|
42
42
|
opts.on('-s', '--server [BASE_URI]') { |uri| Options.server = true; Options.base_uri = uri }
|
43
|
-
opts.on('-
|
43
|
+
opts.on('-a', '--admin [BASE_URI]') { |uri| Options.admin = true; Options.base_uri = uri }
|
44
44
|
opts.on('-P P', '--port P') { |p| Options.port = p }
|
45
45
|
opts.on('-c C', '--concurrence C') { |c| Agent::Options.workers = c.to_i }
|
46
46
|
opts.on('-d D', '--delay D') { |d| Agent::Options.delay = d; Agent::Options.workers = 1 }
|
47
47
|
opts.on('-l L', '--levels L') { |l| Agent::Options.depth = l.to_i }
|
48
|
-
opts.on('-V', '--visual') { Agent::Options.agent = :visual
|
48
|
+
opts.on('-V', '--visual') { Agent::Options.agent = :visual }
|
49
49
|
opts.on('-r', '--reference') { Agent::Options.referenceable = :minimum }
|
50
50
|
opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
|
51
51
|
opts.on('-w', '--window') { Agent::Options.window = true }
|
@@ -63,16 +63,19 @@ module Scrappy
|
|
63
63
|
puts Agent.create.proxy(:http_method=>:get, :uri=>Options.uri).output
|
64
64
|
elsif Options.observe
|
65
65
|
Agent.create.observe(Options.observe)
|
66
|
-
elsif Options.
|
67
|
-
puts "Launching Scrappy Web
|
68
|
-
require 'scrappy/server/
|
66
|
+
elsif Options.admin
|
67
|
+
puts "Launching Scrappy Admin Web Server (browse http://localhost:#{Options.port})..."
|
68
|
+
require 'scrappy/server/server'
|
69
69
|
Thin::Logging.silent = true
|
70
|
-
Scrappy::
|
70
|
+
Scrappy::Server.register Scrappy::Admin
|
71
|
+
Scrappy::Server.run! :host => 'localhost', :port => Options.port, :environment => :production,
|
72
|
+
:base_uri=>Options.base_uri
|
71
73
|
elsif Options.server
|
72
|
-
puts "Launching Scrappy Web Server
|
74
|
+
puts "Launching Scrappy Web Server..."
|
73
75
|
require 'scrappy/server/server'
|
74
76
|
Thin::Logging.silent = true
|
75
|
-
Scrappy::Server.run! :host => 'localhost', :port => Options.port, :environment
|
77
|
+
Scrappy::Server.run! :host => 'localhost', :port => Options.port, :environment => :production,
|
78
|
+
:base_uri => Options.base_uri
|
76
79
|
elsif Options.shell
|
77
80
|
puts "Launching Scrappy Shell..."
|
78
81
|
require 'scrappy/shell'
|
@@ -123,7 +126,7 @@ Options
|
|
123
126
|
-w, --window Shows browser window (requires -v)
|
124
127
|
|
125
128
|
Authors
|
126
|
-
José Ignacio Fernández, Jacobo Blasco
|
129
|
+
José Ignacio Fernández, Alberto Mardomingo, Jacobo Blasco
|
127
130
|
|
128
131
|
Copyright
|
129
132
|
Copyright (c) 2010 José Ignacio Fernández. Licensed under the MIT License:
|
data/lib/scrappy.rb
CHANGED
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -24,6 +24,7 @@ module Scrappy
|
|
24
24
|
@window.show_all
|
25
25
|
@visible = true
|
26
26
|
end
|
27
|
+
@mechanize = Mechanize.new
|
27
28
|
end
|
28
29
|
|
29
30
|
def uri
|
@@ -33,7 +34,7 @@ module Scrappy
|
|
33
34
|
def uri= uri
|
34
35
|
# First, check if the requested uri is a valid HTML page
|
35
36
|
valid = begin
|
36
|
-
|
37
|
+
@mechanize.get(uri).is_a?(Mechanize::Page)
|
37
38
|
rescue
|
38
39
|
false
|
39
40
|
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Scrappy
|
2
|
+
module Admin
|
3
|
+
def self.registered app
|
4
|
+
app.get '/' do
|
5
|
+
if params[:format] and params[:uri]
|
6
|
+
redirect "#{settings.base_uri}/#{params[:format]}/#{simplify_uri(params[:uri])}"
|
7
|
+
else
|
8
|
+
haml :home
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
app.get '/help' do
|
13
|
+
haml :help
|
14
|
+
end
|
15
|
+
|
16
|
+
app.get '/kb' do
|
17
|
+
@uris = ( Agent::Options.kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')) +
|
18
|
+
Agent::Options.kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector')) ).
|
19
|
+
map { |node| node.rdf::value }.flatten.sort.map(&:to_s)
|
20
|
+
haml :kb
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Scrappy
|
2
|
+
module JavaScriptHelpers
|
3
|
+
def bookmark_js
|
4
|
+
"javascript:(function(){" +
|
5
|
+
"if(!document.getElementById('scrappy')){" +
|
6
|
+
"var e=document.createElement('script');" +
|
7
|
+
"e.src='https://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js';" +
|
8
|
+
"e.id='scrappy';" +
|
9
|
+
"document.getElementsByTagName('head')[0].appendChild(e);};" +
|
10
|
+
"if(!window.scrappy_loaded){" +
|
11
|
+
"e=document.createElement('script');" +
|
12
|
+
"e.src='http://localhost:3434/javascripts/scrappy.js?_=#{Time.now.to_i}';" +
|
13
|
+
"e.onerror=function(){alert('Error: Please start Scrappy Server at http://localhost:3434');};" +
|
14
|
+
"document.getElementsByTagName('head')[0].appendChild(e);" +
|
15
|
+
"}"+
|
16
|
+
"})();"
|
17
|
+
end
|
18
|
+
|
19
|
+
def drag_js
|
20
|
+
"alert(\"Don't click this. Drag it to your bookmarks\"); return false;"
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -1,26 +1,18 @@
|
|
1
1
|
require 'sinatra'
|
2
2
|
require 'thin'
|
3
3
|
require 'haml'
|
4
|
+
require 'scrappy/server/helpers'
|
5
|
+
require 'scrappy/server/admin'
|
4
6
|
|
5
7
|
module Scrappy
|
6
8
|
class Server < Sinatra::Base
|
9
|
+
helpers JavaScriptHelpers
|
10
|
+
|
7
11
|
enable :sessions
|
8
|
-
set :root, File.dirname(__FILE__)
|
12
|
+
set :root, File.join(File.dirname(__FILE__), '..', '..', '..')
|
9
13
|
set :views, Proc.new { File.join(root, "views") }
|
10
14
|
set :public, Proc.new { File.join(root, "public") }
|
11
15
|
|
12
|
-
get '/' do
|
13
|
-
if params[:format] and params[:uri]
|
14
|
-
redirect "#{settings.base_uri}/#{params[:format]}/#{simplify_uri(params[:uri])}"
|
15
|
-
else
|
16
|
-
haml :home
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
|
-
get '/help' do
|
21
|
-
haml :help
|
22
|
-
end
|
23
|
-
|
24
16
|
get '/:format/*' do |format, url|
|
25
17
|
process_request :get, format, url, params[:callback]
|
26
18
|
end
|
data/public/favicon.ico
ADDED
Binary file
|
File without changes
|
Binary file
|
@@ -1,3 +1,22 @@
|
|
1
|
+
add_visual_data = function() {
|
2
|
+
var items = document.documentElement.getElementsByTagName('*');
|
3
|
+
var i=0;
|
4
|
+
for(var i=0; i<items.length; i++) {
|
5
|
+
var item = items[i];
|
6
|
+
item.setAttribute('vx', item.offsetLeft);
|
7
|
+
item.setAttribute('vy', item.offsetTop);
|
8
|
+
item.setAttribute('vw', item.offsetWidth);
|
9
|
+
item.setAttribute('vh', item.offsetHeight);
|
10
|
+
item.setAttribute('vsize', document.defaultView.getComputedStyle(item, null).getPropertyValue('font-size'));
|
11
|
+
var weight = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-weight');
|
12
|
+
if (weight == 'normal') weight = 400;
|
13
|
+
if (weight == 'bold') weight = 700;
|
14
|
+
item.setAttribute('vweight', weight);
|
15
|
+
item.setAttribute('vcolor', document.defaultView.getComputedStyle(item, null).getPropertyValue('color'));
|
16
|
+
item.setAttribute('vbcolor', document.defaultView.getComputedStyle(item, null).getPropertyValue('background-color'));
|
17
|
+
}
|
18
|
+
}
|
19
|
+
|
1
20
|
$(document).ready(function(){
|
2
21
|
$("body").append("<div id='myTrees'></div>")
|
3
22
|
$("#page > *").bind('mouseover', function(e){
|
@@ -42,3 +61,5 @@ $(document).ready(function(){
|
|
42
61
|
myTrees.appendChild(li);
|
43
62
|
});
|
44
63
|
});
|
64
|
+
|
65
|
+
window.scrappy_loaded = true
|
@@ -0,0 +1,171 @@
|
|
1
|
+
body {
|
2
|
+
font-family: Arial, sans;
|
3
|
+
margin: 0;
|
4
|
+
}
|
5
|
+
pre {
|
6
|
+
width: 600px;
|
7
|
+
margin-left: auto;
|
8
|
+
margin-right: auto;
|
9
|
+
border: 1px solid;
|
10
|
+
padding: 10px;
|
11
|
+
}
|
12
|
+
a:link, a:visited {
|
13
|
+
color: #33f;
|
14
|
+
text-decoration: none;
|
15
|
+
}
|
16
|
+
a:hover, a:active {
|
17
|
+
color: #33f;
|
18
|
+
text-decoration: underline;
|
19
|
+
}
|
20
|
+
h1, h2, h3, h4 {
|
21
|
+
color: #777;
|
22
|
+
}
|
23
|
+
h2 {
|
24
|
+
font-weight: normal;
|
25
|
+
padding-bottom: 3px;
|
26
|
+
padding-top: 5px;
|
27
|
+
border-bottom: 1px solid #aaa;
|
28
|
+
}
|
29
|
+
img {
|
30
|
+
border: none;
|
31
|
+
}
|
32
|
+
|
33
|
+
|
34
|
+
#bar {
|
35
|
+
height: 33px;
|
36
|
+
font-size: 13px;
|
37
|
+
border-bottom: 1px solid;
|
38
|
+
background-color: #eee;
|
39
|
+
color: #ddd;
|
40
|
+
margin: 0;
|
41
|
+
padding: 0;
|
42
|
+
}
|
43
|
+
#bar ul {
|
44
|
+
display: block;
|
45
|
+
margin: 0 5px 0 5px;
|
46
|
+
padding: 0;
|
47
|
+
list-style-type: none;
|
48
|
+
}
|
49
|
+
#bar li {
|
50
|
+
display: inline-block;
|
51
|
+
margin: 0;
|
52
|
+
padding: 0;
|
53
|
+
}
|
54
|
+
#bar a {
|
55
|
+
position: relative;
|
56
|
+
margin-left: -4px;
|
57
|
+
height: 14px;
|
58
|
+
padding: 10px;
|
59
|
+
display: block;
|
60
|
+
}
|
61
|
+
#bar a:hover {
|
62
|
+
background-color: #ddd;
|
63
|
+
text-decoration: none;
|
64
|
+
}
|
65
|
+
#bar img {
|
66
|
+
position: relative;
|
67
|
+
top: -3px;
|
68
|
+
vertical-align: middle;
|
69
|
+
}
|
70
|
+
#bar ul.left {
|
71
|
+
float: left;
|
72
|
+
}
|
73
|
+
#bar ul.right {
|
74
|
+
float: right;
|
75
|
+
}
|
76
|
+
#column {
|
77
|
+
float:left;
|
78
|
+
margin-top: 20px;
|
79
|
+
height: auto;
|
80
|
+
width: 200px;
|
81
|
+
height: 600px;
|
82
|
+
border-right: 1px solid #ddd;
|
83
|
+
overflow-y: auto;
|
84
|
+
}
|
85
|
+
#body {
|
86
|
+
margin: auto; width: 800px; padding: 15px;
|
87
|
+
font-size: 14px; color: #333;
|
88
|
+
}
|
89
|
+
#center {
|
90
|
+
text-align: center;
|
91
|
+
margin-top: 100px;
|
92
|
+
margin-bottom: 100px;
|
93
|
+
}
|
94
|
+
#search {
|
95
|
+
margin-top: 40px;
|
96
|
+
font-size:20px;
|
97
|
+
margin-bottom: 10px;
|
98
|
+
}
|
99
|
+
#search input {
|
100
|
+
width: 700px; height:30px; font-size:16px;
|
101
|
+
}
|
102
|
+
#buttons {
|
103
|
+
width: 400px;
|
104
|
+
margin: auto;
|
105
|
+
}
|
106
|
+
#buttons select {
|
107
|
+
width: 100px; height: 30px; font-size:16px;
|
108
|
+
margin-left: 5px;
|
109
|
+
}
|
110
|
+
#buttons button {
|
111
|
+
width: 100px; height: 30px; font-size:16px;
|
112
|
+
margin-right: 5px;
|
113
|
+
}
|
114
|
+
#footer {
|
115
|
+
margin-top:30px; text-align: center; font-size:14px; color: #555;
|
116
|
+
height: 50px;
|
117
|
+
}
|
118
|
+
|
119
|
+
.bookmark,
|
120
|
+
a.bookmark:hover,
|
121
|
+
a.bookmark:visited,
|
122
|
+
a.bookmark:link,
|
123
|
+
a.bookmark:active {
|
124
|
+
font-size: 11px;
|
125
|
+
text-decoration: none;
|
126
|
+
border-radius: 7px;
|
127
|
+
padding: 6px;
|
128
|
+
background-color: #ccc;
|
129
|
+
background: -webkit-gradient(linear, left top, left bottom, from(#d7d7d7), to(#ababab));
|
130
|
+
background: -moz-linear-gradient(top, #d7d7d7, #ababab);
|
131
|
+
border-color: #c9c9c9;
|
132
|
+
border-bottom-color: #9a9a9a;
|
133
|
+
border-width: 1px;
|
134
|
+
border-style: solid;
|
135
|
+
margin: 5px;
|
136
|
+
color: #444;
|
137
|
+
-moz-border-radius: 7px;
|
138
|
+
}
|
139
|
+
|
140
|
+
ul.detail {
|
141
|
+
padding: 0;
|
142
|
+
list-style-type: none;
|
143
|
+
}
|
144
|
+
ul.detail li {
|
145
|
+
padding: 6px;
|
146
|
+
background-color: #eee;
|
147
|
+
margin: 1px;
|
148
|
+
}
|
149
|
+
ul.detail li span {
|
150
|
+
display: inline-block;
|
151
|
+
}
|
152
|
+
ul.detail li span.name {
|
153
|
+
width: 600px;
|
154
|
+
overflow-x: hidden;
|
155
|
+
font-family: monospace;
|
156
|
+
font-size: 12px;
|
157
|
+
}
|
158
|
+
ul.detail li span.format {
|
159
|
+
float: right;
|
160
|
+
font-size: 12px;
|
161
|
+
font-weight: bold;
|
162
|
+
margin-left: 10px;
|
163
|
+
text-align: center;
|
164
|
+
}
|
165
|
+
|
166
|
+
ul.detail li span.format a:hover,
|
167
|
+
ul.detail li span.format a:link,
|
168
|
+
ul.detail li span.format a:visited,
|
169
|
+
ul.detail li span.format a:active {
|
170
|
+
color: #900;
|
171
|
+
}
|
data/scrappy.gemspec
CHANGED
@@ -2,17 +2,17 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.2.
|
5
|
+
s.version = "0.2.1"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-03-
|
9
|
+
s.date = %q{2011-03-11}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
13
13
|
s.executables = ["scrappy"]
|
14
|
-
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/
|
15
|
-
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/
|
14
|
+
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/repository.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
+
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/repository.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "public/favicon.ico", "public/images/logo.png", "public/images/logo_tiny.png", "public/javascripts/scrappy.js", "public/stylesheets/application.css", "test/test_helper.rb", "test/test_scrappy.rb", "views/help.haml", "views/home.haml", "views/kb.haml", "views/layout.haml", "scrappy.gemspec"]
|
16
16
|
s.homepage = %q{http://github.com/josei/scrappy}
|
17
17
|
s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
|
18
18
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
|
data/views/help.haml
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
#body
|
2
|
+
%h1 Help
|
3
|
+
%p
|
4
|
+
Scrappy Web Server is a web interface to Scrappy functionalities.
|
5
|
+
You can use it to get data from a web resource and integrate it with other systems.
|
6
|
+
%h2 URL format
|
7
|
+
%p
|
8
|
+
The service uses the following URL format:
|
9
|
+
%pre http://[host]/[format]/[url]
|
10
|
+
%p
|
11
|
+
For example, to retrieve http://example.com/~user/?test=1 with RDF serialization:
|
12
|
+
%pre==#{settings.base_uri || "http://localhost:#{settings.port}"}/rdf/http://example.com/~user/%3Ftest%3D1
|
13
|
+
%p Available serialization formats are rdf, rdfxml, png, yarf, ntriples, turtle, json, jsonrdf, and ejson.
|
14
|
+
%h2 Creating extractors
|
15
|
+
%p
|
16
|
+
Drag this to your bookmarks:
|
17
|
+
%a.bookmark{:href=>bookmark_js, :onclick=>drag_js} Scrappy
|
18
|
+
%p
|
19
|
+
Then visit the web page you want to build an extractor for.
|
20
|
+
Click on your "Scrappy" bookmark and annotate the web page.
|
data/views/home.haml
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#center
|
2
|
+
%a{:href=>'http://github.com/josei/scrappy'}
|
3
|
+
%img{:src=>"#{settings.base_uri}/images/logo.png"}
|
4
|
+
%form
|
5
|
+
#search
|
6
|
+
%input{:name=>:uri, :size=>30, :type=>:text, :value=>'Enter URI...', :onclick=>"if(this.value=='Enter URI...') this.value='';"}
|
7
|
+
#buttons
|
8
|
+
%button Scrape
|
9
|
+
%select{:name=>:format}
|
10
|
+
%option{:value=>:rdf} RDF
|
11
|
+
%option{:value=>:png} PNG
|
12
|
+
%option{:value=>:ejson} JSON
|
13
|
+
%option{:value=>:yarf} YARF
|
14
|
+
%option{:value=>:ntriples} nTriples
|
data/views/kb.haml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#body
|
2
|
+
%h1 Knowledge base
|
3
|
+
%p
|
4
|
+
%ul.detail
|
5
|
+
-@uris.each do |uri|
|
6
|
+
%li
|
7
|
+
%span.name
|
8
|
+
-if !uri.include?('*')
|
9
|
+
%a{:href=>uri}=uri
|
10
|
+
-else
|
11
|
+
=uri
|
12
|
+
-if !uri.include?('*')
|
13
|
+
-[['RDF', :rdf], ['JSON', :ejson], ['YARF', :yarf], ['PNG', :png]].reverse.each do |format, format_code|
|
14
|
+
%span.format
|
15
|
+
%a{:href=>"#{settings.base_uri}/#{format_code}/#{uri}"}=format
|
data/views/layout.haml
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
!!!
|
2
|
+
%html
|
3
|
+
%head
|
4
|
+
%title Scrappy
|
5
|
+
%link{:type=>"text/css", :href=>"#{settings.base_uri}/stylesheets/application.css", :rel=>"stylesheet"}
|
6
|
+
%body
|
7
|
+
#bar
|
8
|
+
-if request.fullpath!='/'
|
9
|
+
%ul.left
|
10
|
+
%li
|
11
|
+
%a{:href=>"#{settings.base_uri}/"}
|
12
|
+
%img{:src=>"#{settings.base_uri}/images/logo_tiny.png", :alt=>"Scrappy"}
|
13
|
+
%ul.right
|
14
|
+
%li
|
15
|
+
%a{:href=>"#{settings.base_uri}/kb"} Knowledge base
|
16
|
+
%li
|
17
|
+
%a{:href=>"#{settings.base_uri}/help"} Help
|
18
|
+
=yield
|
19
|
+
#footer
|
20
|
+
%a{:href=>"#{settings.base_uri}/"} Home
|
21
|
+
|
|
22
|
+
%a{:href=>'http://github.com/josei/scrappy'} About
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 2
|
8
|
-
-
|
9
|
-
version: 0.2.
|
8
|
+
- 1
|
9
|
+
version: 0.2.1
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-03-
|
17
|
+
date: 2011-03-11 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -152,16 +152,16 @@ extensions: []
|
|
152
152
|
extra_rdoc_files:
|
153
153
|
- README.rdoc
|
154
154
|
- bin/scrappy
|
155
|
-
- lib/js/annotator.js
|
156
155
|
- lib/scrappy.rb
|
157
156
|
- lib/scrappy/agent/agent.rb
|
158
157
|
- lib/scrappy/agent/blind_agent.rb
|
159
158
|
- lib/scrappy/agent/cache.rb
|
160
159
|
- lib/scrappy/agent/dumper.rb
|
160
|
+
- lib/scrappy/agent/extractor.rb
|
161
161
|
- lib/scrappy/agent/formats.rb
|
162
162
|
- lib/scrappy/agent/map_reduce.rb
|
163
|
-
- lib/scrappy/agent/extractor.rb
|
164
163
|
- lib/scrappy/agent/visual_agent.rb
|
164
|
+
- lib/scrappy/repository.rb
|
165
165
|
- lib/scrappy/selectors/base_uri.rb
|
166
166
|
- lib/scrappy/selectors/css.rb
|
167
167
|
- lib/scrappy/selectors/new_uri.rb
|
@@ -171,14 +171,9 @@ extra_rdoc_files:
|
|
171
171
|
- lib/scrappy/selectors/uri.rb
|
172
172
|
- lib/scrappy/selectors/uri_pattern.rb
|
173
173
|
- lib/scrappy/selectors/xpath.rb
|
174
|
-
- lib/scrappy/server/
|
174
|
+
- lib/scrappy/server/admin.rb
|
175
|
+
- lib/scrappy/server/helpers.rb
|
175
176
|
- lib/scrappy/server/server.rb
|
176
|
-
- lib/scrappy/server/public/images/logo.png
|
177
|
-
- lib/scrappy/server/public/images/logo_small.png
|
178
|
-
- lib/scrappy/server/public/stylesheets/application.css
|
179
|
-
- lib/scrappy/server/views/home.haml
|
180
|
-
- lib/scrappy/server/views/help.haml
|
181
|
-
- lib/scrappy/repository.rb
|
182
177
|
- lib/scrappy/shell.rb
|
183
178
|
- lib/scrappy/support.rb
|
184
179
|
- lib/scrappy/webkit/webkit.rb
|
@@ -189,16 +184,16 @@ files:
|
|
189
184
|
- Rakefile
|
190
185
|
- bin/scrappy
|
191
186
|
- kb/elmundo.yarf
|
192
|
-
- lib/js/annotator.js
|
193
187
|
- lib/scrappy.rb
|
194
188
|
- lib/scrappy/agent/agent.rb
|
195
189
|
- lib/scrappy/agent/blind_agent.rb
|
196
190
|
- lib/scrappy/agent/cache.rb
|
197
191
|
- lib/scrappy/agent/dumper.rb
|
192
|
+
- lib/scrappy/agent/extractor.rb
|
198
193
|
- lib/scrappy/agent/formats.rb
|
199
194
|
- lib/scrappy/agent/map_reduce.rb
|
200
|
-
- lib/scrappy/agent/extractor.rb
|
201
195
|
- lib/scrappy/agent/visual_agent.rb
|
196
|
+
- lib/scrappy/repository.rb
|
202
197
|
- lib/scrappy/selectors/base_uri.rb
|
203
198
|
- lib/scrappy/selectors/css.rb
|
204
199
|
- lib/scrappy/selectors/new_uri.rb
|
@@ -208,19 +203,23 @@ files:
|
|
208
203
|
- lib/scrappy/selectors/uri.rb
|
209
204
|
- lib/scrappy/selectors/uri_pattern.rb
|
210
205
|
- lib/scrappy/selectors/xpath.rb
|
211
|
-
- lib/scrappy/server/
|
206
|
+
- lib/scrappy/server/admin.rb
|
207
|
+
- lib/scrappy/server/helpers.rb
|
212
208
|
- lib/scrappy/server/server.rb
|
213
|
-
- lib/scrappy/server/public/images/logo.png
|
214
|
-
- lib/scrappy/server/public/images/logo_small.png
|
215
|
-
- lib/scrappy/server/public/stylesheets/application.css
|
216
|
-
- lib/scrappy/server/views/home.haml
|
217
|
-
- lib/scrappy/server/views/help.haml
|
218
|
-
- lib/scrappy/repository.rb
|
219
209
|
- lib/scrappy/shell.rb
|
220
210
|
- lib/scrappy/support.rb
|
221
211
|
- lib/scrappy/webkit/webkit.rb
|
212
|
+
- public/favicon.ico
|
213
|
+
- public/images/logo.png
|
214
|
+
- public/images/logo_tiny.png
|
215
|
+
- public/javascripts/scrappy.js
|
216
|
+
- public/stylesheets/application.css
|
222
217
|
- test/test_helper.rb
|
223
218
|
- test/test_scrappy.rb
|
219
|
+
- views/help.haml
|
220
|
+
- views/home.haml
|
221
|
+
- views/kb.haml
|
222
|
+
- views/layout.haml
|
224
223
|
- scrappy.gemspec
|
225
224
|
has_rdoc: true
|
226
225
|
homepage: http://github.com/josei/scrappy
|
data/lib/scrappy/server/proxy.rb
DELETED
@@ -1,34 +0,0 @@
|
|
1
|
-
require 'sinatra'
|
2
|
-
require 'thin'
|
3
|
-
|
4
|
-
module Scrappy
|
5
|
-
class Proxy < Sinatra::Base
|
6
|
-
get '*' do
|
7
|
-
process_request :get
|
8
|
-
end
|
9
|
-
|
10
|
-
post '*' do
|
11
|
-
process_request :post
|
12
|
-
end
|
13
|
-
|
14
|
-
protected
|
15
|
-
def process_request method
|
16
|
-
response = agent.proxy :method=>method, :uri=>request.env['REQUEST_URI'], :inputs=>params
|
17
|
-
|
18
|
-
case response.status
|
19
|
-
when :redirect
|
20
|
-
redirect response.uri
|
21
|
-
when :ok
|
22
|
-
headers 'Content-Type' => response.content_type
|
23
|
-
response.output
|
24
|
-
else
|
25
|
-
status 500
|
26
|
-
"Internal error"
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
def agent
|
31
|
-
Scrappy::Agent[request.ip]
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
Binary file
|
@@ -1,51 +0,0 @@
|
|
1
|
-
body {
|
2
|
-
font-family: Arial, sans;
|
3
|
-
}
|
4
|
-
#center {
|
5
|
-
text-align: center;
|
6
|
-
margin-top: 100px;
|
7
|
-
margin-bottom: 100px;
|
8
|
-
}
|
9
|
-
#search {
|
10
|
-
margin-top: 40px;
|
11
|
-
font-size:20px;
|
12
|
-
margin-bottom: 10px;
|
13
|
-
}
|
14
|
-
#search input {
|
15
|
-
width: 700px; height:30px; font-size:16px;
|
16
|
-
}
|
17
|
-
#buttons {
|
18
|
-
width: 400px;
|
19
|
-
margin: auto;
|
20
|
-
}
|
21
|
-
#buttons select {
|
22
|
-
width: 100px; height: 30px; font-size:16px;
|
23
|
-
margin-left: 5px;
|
24
|
-
}
|
25
|
-
#buttons button {
|
26
|
-
width: 100px; height: 30px; font-size:16px;
|
27
|
-
margin-right: 5px;
|
28
|
-
}
|
29
|
-
|
30
|
-
pre {
|
31
|
-
width: 600px;
|
32
|
-
margin-left: auto;
|
33
|
-
margin-right: auto;
|
34
|
-
border: 1px solid;
|
35
|
-
padding:10px;
|
36
|
-
}
|
37
|
-
#header {
|
38
|
-
margin: auto; width: 800px; padding: 15px;
|
39
|
-
border-bottom: 1px solid;
|
40
|
-
margin-top: 20px; font-size: 14px; color: #555;
|
41
|
-
}
|
42
|
-
#body {
|
43
|
-
margin: auto; width: 800px; padding: 15px;
|
44
|
-
font-size: 14px; color: #555;
|
45
|
-
}
|
46
|
-
#footer {
|
47
|
-
margin-top:30px; text-align: center; font-size:14px; color: #555;
|
48
|
-
}
|
49
|
-
img {
|
50
|
-
border: none;
|
51
|
-
}
|
@@ -1,25 +0,0 @@
|
|
1
|
-
!!!
|
2
|
-
%html
|
3
|
-
%head
|
4
|
-
%title Help - Scrappy
|
5
|
-
%link{:type=>"text/css", :href=>"#{settings.base_uri}/stylesheets/application.css", :rel=>"stylesheet"}
|
6
|
-
%body
|
7
|
-
%div#header
|
8
|
-
%img{:src=>"#{settings.base_uri}/images/logo_small.png"}
|
9
|
-
%div#body
|
10
|
-
%h1 Help
|
11
|
-
%p
|
12
|
-
Scrappy web server is a web interface to Scrappy functionalities.
|
13
|
-
You can use it to get data from a web resource and integrate it with other system.
|
14
|
-
The service uses the following URL format:
|
15
|
-
%pre http://[host]/[format]/[url]
|
16
|
-
%p
|
17
|
-
For example, to retrieve http://example.com/~user/?test=1 with RDF serialization:
|
18
|
-
%pre==#{settings.base_uri || "http://localhost:#{settings.port}"}/rdf/http://example.com/~user/%3Ftest%3D1
|
19
|
-
%p Available serialization formats are rdf, rdfxml, png, yarf, ntriples, turtle, json, jsonrdf, and ejson.
|
20
|
-
%div#footer
|
21
|
-
%a{:href=>"#{settings.base_uri}/"} Home
|
22
|
-
|
|
23
|
-
%a{:href=>"#{settings.base_uri}/help"} Help
|
24
|
-
|
|
25
|
-
%a{:href=>'http://github.com/josei/scrappy'} About
|
@@ -1,26 +0,0 @@
|
|
1
|
-
!!!
|
2
|
-
%html
|
3
|
-
%head
|
4
|
-
%title Scrappy
|
5
|
-
%link{:type=>"text/css", :href=>"#{settings.base_uri}/stylesheets/application.css", :rel=>"stylesheet"}
|
6
|
-
%body
|
7
|
-
%div#center
|
8
|
-
%a{:href=>'http://github.com/josei/scrappy'}
|
9
|
-
%img{:src=>"#{settings.base_uri}/images/logo.png"}
|
10
|
-
%form
|
11
|
-
%div#search
|
12
|
-
%input{:name=>:uri, :size=>30, :type=>:text, :value=>'Enter URI...', :onclick=>"if(this.value=='Enter URI...') this.value='';"}
|
13
|
-
%div#buttons
|
14
|
-
%button Scrape
|
15
|
-
%select{:name=>:format}
|
16
|
-
%option{:value=>:rdf} RDF
|
17
|
-
%option{:value=>:png} PNG
|
18
|
-
%option{:value=>:ejson} JSON
|
19
|
-
%option{:value=>:yarf} YARF
|
20
|
-
%option{:value=>:ntriples} nTriples
|
21
|
-
%div#footer
|
22
|
-
%a{:href=>"#{settings.base_uri}/"} Home
|
23
|
-
|
|
24
|
-
%a{:href=>"#{settings.base_uri}/help"} Help
|
25
|
-
|
|
26
|
-
%a{:href=>'http://github.com/josei/scrappy'} About
|