scrappy 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +6 -0
- data/Manifest +21 -14
- data/README.rdoc +5 -9
- data/Rakefile +1 -2
- data/bin/scrappy +141 -51
- data/lib/scrappy.rb +6 -9
- data/lib/scrappy/agent/agent.rb +3 -3
- data/lib/scrappy/extractor/extractor.rb +108 -0
- data/lib/scrappy/{agent → extractor}/formats.rb +0 -0
- data/lib/scrappy/extractor/fragment.rb +111 -0
- data/lib/scrappy/extractor/selector.rb +41 -0
- data/lib/scrappy/{selectors → extractor/selectors}/base_uri.rb +1 -3
- data/lib/scrappy/extractor/selectors/css.rb +5 -0
- data/lib/scrappy/{selectors → extractor/selectors}/new_uri.rb +1 -3
- data/lib/scrappy/{selectors → extractor/selectors}/root.rb +1 -4
- data/lib/scrappy/{selectors → extractor/selectors}/section.rb +1 -4
- data/lib/scrappy/{selectors → extractor/selectors}/slice.rb +1 -3
- data/lib/scrappy/{selectors → extractor/selectors}/uri.rb +2 -4
- data/lib/scrappy/{selectors → extractor/selectors}/uri_pattern.rb +2 -4
- data/lib/scrappy/extractor/selectors/visual.rb +39 -0
- data/lib/scrappy/{selectors → extractor/selectors}/xpath.rb +1 -4
- data/lib/scrappy/server/admin.rb +89 -2
- data/lib/scrappy/server/helpers.rb +11 -2
- data/lib/scrappy/server/server.rb +1 -0
- data/lib/scrappy/trainer/trainer.rb +101 -0
- data/public/javascripts/annotator.js +75 -0
- data/public/javascripts/remote.js +132 -0
- data/public/stylesheets/application.css +39 -12
- data/scrappy.gemspec +13 -11
- data/views/extractors.haml +24 -0
- data/views/layout.haml +14 -4
- data/views/patterns.haml +19 -0
- data/views/samples.haml +28 -0
- metadata +58 -56
- data/lib/scrappy/agent/extractor.rb +0 -196
- data/lib/scrappy/selectors/css.rb +0 -10
- data/public/javascripts/scrappy.js +0 -65
- data/views/kb.haml +0 -15
@@ -0,0 +1,75 @@
|
|
1
|
+
/**
 * Annotates every element of the current document with its rendered
 * geometry and font properties, stored as custom attributes:
 *
 *   vx, vy  - offsetLeft / offsetTop summed over the element's offsetParent
 *             chain (both 0 when the element has no offsetParent)
 *   vw, vh  - offsetWidth / offsetHeight
 *   vsize   - computed font-size with its last two characters removed
 *   vfont   - last entry of the computed font-family list, trimmed
 *   vweight - computed font-weight, with 'normal' mapped to 400 and
 *             'bold' mapped to 700
 */
var add_visual_data = function() {
  var items = document.documentElement.getElementsByTagName('*');

  for (var i = 0; i < items.length; i++) {
    var item = items[i];

    // Accumulate the offsets of the element and of each offsetParent above it.
    var x = 0;
    var y = 0;
    if (item.offsetParent) {
      var obj = item;
      do {
        x += obj.offsetLeft;
        y += obj.offsetTop;
      } while ((obj = obj.offsetParent));
    }

    item.setAttribute('vx', x);
    item.setAttribute('vy', y);
    item.setAttribute('vw', item.offsetWidth);
    item.setAttribute('vh', item.offsetHeight);

    // Resolve the computed style once per element instead of once per property.
    var style = document.defaultView.getComputedStyle(item, null);

    // Drop the trailing two-character unit suffix (e.g. "16px" -> "16").
    var size = style.getPropertyValue('font-size');
    size = size.substring(0, size.length - 2);
    item.setAttribute('vsize', size);

    // Only the last family of the comma-separated list is recorded.
    var fonts = style.getPropertyValue('font-family').split(",");
    var font = fonts[fonts.length - 1].trim();
    item.setAttribute('vfont', font);

    // Normalise the keyword weights to their numeric equivalents.
    var weight = style.getPropertyValue('font-weight');
    if (weight === 'normal') {
      weight = 400;
    } else if (weight === 'bold') {
      weight = 700;
    }
    item.setAttribute('vweight', weight);
  }
};
|
31
|
+
|
32
|
+
|
33
|
+
/**
 * On DOM ready, appends the "Scrappy" dialog to the page and wires up its
 * links.
 *
 * The dialog content depends on window.scrappy_extractor: when it is truthy
 * the dialog links to the RDF output for the current URL and to sample
 * upload; otherwise it offers page annotation and extractor generation.
 *
 * `$` is taken from the ready callback's argument (jQuery passes itself to
 * ready handlers) so the handler does not depend on a global `$`.
 */
jQuery(document).ready(function($){
  var div;
  if (window.scrappy_extractor) {
    div = "<div id='scrappy_window' title='Scrappy'>" +
            "<p>Extractor available for this URL</p>" +
            "<p><a href='http://localhost:3434/rdf/"+escape(window.location)+"'>See output</a></p>" +
            "<p><a class='sample' href='http://localhost:3434/samples'>Upload sample</a></p>" +
          "</div>";
  } else {
    div = "<div id='scrappy_window' title='Scrappy'>" +
            "<p>No extractor available for this URL</p>" +
            "<p><a href='TODO'>Annotate page</a></p>" +
            "<p><a class='extractor' href='http://localhost:3434/extractors'>Generate extractor</a></p>" +
          "</div>";
  }

  $("body").append(div);

  // Clicking "Generate extractor" / "Upload sample" POSTs the current page
  // (serialized HTML, character set and URL) to the link's href through a
  // hidden form instead of following the link.
  $('#scrappy_window .extractor, #scrappy_window .sample').live('click', function (e){
    // All locals belong to one var statement; previously a stray ';' ended
    // it early and enc, uri and form became implicit globals.
    var link = $(this),
        href = link.attr('href'),
        html = $('<input name="html" type="hidden" />'),
        enc  = $('<input name="encoding" type="hidden" />'),
        uri  = $('<input name="uri" type="hidden" />'),
        form = $('<form method="post" action="'+href+'"></form>');

    enc.attr('value', document.characterSet);
    html.attr('value', document.documentElement.outerHTML);
    uri.attr('value', window.location);

    form.hide()
        .append(html)
        .append(enc)
        .append(uri)
        .appendTo('body');

    e.preventDefault();
    form.submit();
  });

  $("#scrappy_window").dialog();
});
|
72
|
+
|
73
|
+
// Annotate the document immediately when this script is evaluated.
add_visual_data();

// Flag on window recording that this script has been evaluated.
window.scrappy_loaded = true;
|
@@ -0,0 +1,132 @@
|
|
1
|
+
/**
 * Unobtrusive handlers driven by data-* attributes:
 *
 *   data-confirm       - ask for confirmation before a click proceeds
 *   data-remote        - perform the form submit / link click via $.ajax
 *   data-method        - submit a link through a hidden POST form carrying
 *                        a `_method` field (plus the CSRF field when the
 *                        csrf-param / csrf-token meta tags are present)
 *   data-disable-with  - disable inputs and swap their label while the
 *                        containing form is being submitted
 */
jQuery(function ($) {
  var csrf_token = $('meta[name=csrf-token]').attr('content'),
      csrf_param = $('meta[name=csrf-param]').attr('content');

  $.fn.extend({
    /**
     * Triggers a custom event on an element and returns the event result
     * this is used to get around not being able to ensure callbacks are placed
     * at the end of the chain.
     *
     * Returns false only when a handler's result is exactly false.
     *
     * TODO: deprecate with jQuery 1.4.2 release, in favor of subscribing to our
     *       own events and placing ourselves at the end of the chain.
     */
    triggerAndReturn: function (name, data) {
      var event = new $.Event(name);
      this.trigger(event, data);

      return event.result !== false;
    },

    /**
     * Handles execution of remote calls firing overridable events along the way
     *
     * Fires ajax:before (a false result skips the request), then
     * ajax:loading, ajax:success / ajax:failure and ajax:complete from the
     * $.ajax callbacks; ajax:after is fired whether or not the request was
     * issued.
     */
    callRemote: function () {
      var el       = this,
          method   = el.attr('method') || el.attr('data-method') || 'GET',
          url      = el.attr('action') || el.attr('href'),
          dataType = el.attr('data-type')  || 'script';

      if (url === undefined) {
        // Thrown as a plain string; left unchanged so code that inspects the
        // thrown value keeps working.
        throw "No URL specified for remote call (action or href must be present).";
      } else {
        if (el.triggerAndReturn('ajax:before')) {
          // Only forms contribute request data.
          var data = el.is('form') ? el.serializeArray() : [];
          $.ajax({
            url: url,
            data: data,
            dataType: dataType,
            type: method.toUpperCase(),
            beforeSend: function (xhr) {
              el.trigger('ajax:loading', xhr);
            },
            success: function (data, status, xhr) {
              el.trigger('ajax:success', [data, status, xhr]);
            },
            complete: function (xhr) {
              el.trigger('ajax:complete', xhr);
            },
            error: function (xhr, status, error) {
              el.trigger('ajax:failure', [xhr, status, error]);
            }
          });
        }

        el.trigger('ajax:after');
      }
    }
  });

  /**
   *  confirmation handler
   */
  $('a[data-confirm],input[data-confirm]').live('click', function () {
    var el = $(this);
    if (el.triggerAndReturn('confirm')) {
      if (!confirm(el.attr('data-confirm'))) {
        return false;
      }
    }
  });


  /**
   * remote handlers
   */
  $('form[data-remote]').live('submit', function (e) {
    $(this).callRemote();
    e.preventDefault();
  });

  $('a[data-remote],input[data-remote]').live('click', function (e) {
    $(this).callRemote();
    e.preventDefault();
  });

  /**
   * data-method handler (non-remote links)
   *
   * Attribute values are assigned with .attr() rather than concatenated
   * into markup, so quotes or angle brackets in the href, method or CSRF
   * values cannot break out of the generated elements.
   */
  $('a[data-method]:not([data-remote])').live('click', function (e){
    var link = $(this),
        href = link.attr('href'),
        method = link.attr('data-method'),
        form = $('<form method="post"></form>'),
        method_input = $('<input name="_method" type="hidden" />');

    // String(...) keeps the previous behaviour for a missing href (the
    // action becomes "undefined") and stops .attr() acting as a getter.
    form.attr('action', String(href));
    method_input.attr('value', String(method));

    form.hide()
        .append(method_input);

    if (csrf_param != null && csrf_token != null) {
      var csrf_input = $('<input type="hidden" />');
      csrf_input.attr('name', String(csrf_param));
      csrf_input.attr('value', String(csrf_token));
      form.append(csrf_input);
    }

    form.appendTo('body');

    e.preventDefault();
    form.submit();
  });

  /**
   * disable-with handlers
   */
  var disable_with_input_selector           = 'input[data-disable-with]';
  var disable_with_form_remote_selector     = 'form[data-remote]:has('       + disable_with_input_selector + ')';
  var disable_with_form_not_remote_selector = 'form:not([data-remote]):has(' + disable_with_input_selector + ')';

  // Remembers each input's current value, replaces it with the
  // data-disable-with text and disables the input.
  var disable_with_input_function = function () {
    $(this).find(disable_with_input_selector).each(function () {
      var input = $(this);
      input.data('enable-with', input.val())
           .attr('value', input.attr('data-disable-with'))
           .attr('disabled', 'disabled');
    });
  };

  $(disable_with_form_remote_selector).live('ajax:before', disable_with_input_function);
  $(disable_with_form_not_remote_selector).live('submit', disable_with_input_function);

  // Remote forms get their inputs re-enabled and relabelled once the
  // request completes.
  $(disable_with_form_remote_selector).live('ajax:complete', function () {
    $(this).find(disable_with_input_selector).each(function () {
      var input = $(this);
      input.removeAttr('disabled')
           .val(input.data('enable-with'));
    });
  });

});
|
@@ -18,7 +18,7 @@ a:hover, a:active {
|
|
18
18
|
text-decoration: underline;
|
19
19
|
}
|
20
20
|
h1, h2, h3, h4 {
|
21
|
-
color: #
|
21
|
+
color: #888;
|
22
22
|
}
|
23
23
|
h2 {
|
24
24
|
font-weight: normal;
|
@@ -73,14 +73,19 @@ img {
|
|
73
73
|
#bar ul.right {
|
74
74
|
float: right;
|
75
75
|
}
|
76
|
-
#
|
77
|
-
float:left;
|
76
|
+
#notice {
|
78
77
|
margin-top: 20px;
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
78
|
+
margin-left: auto;
|
79
|
+
margin-right: auto;
|
80
|
+
padding: 15px;
|
81
|
+
width: 800px;
|
82
|
+
color: #333;
|
83
|
+
font-size: 14px;
|
84
|
+
text-align: center;
|
85
|
+
background: -webkit-gradient(linear, left top, left bottom, from(#af9), to(#dfd));
|
86
|
+
background: -moz-linear-gradient(top, #af9, #dfd);
|
87
|
+
border-radius: 10px;
|
88
|
+
-moz-border-radius: 10px;
|
84
89
|
}
|
85
90
|
#body {
|
86
91
|
margin: auto; width: 800px; padding: 15px;
|
@@ -112,7 +117,7 @@ img {
|
|
112
117
|
margin-right: 5px;
|
113
118
|
}
|
114
119
|
#footer {
|
115
|
-
margin-top:30px; text-align: center; font-size:
|
120
|
+
margin-top:30px; text-align: center; font-size:12px; color: #555;
|
116
121
|
height: 50px;
|
117
122
|
}
|
118
123
|
|
@@ -142,6 +147,7 @@ ul.detail {
|
|
142
147
|
list-style-type: none;
|
143
148
|
}
|
144
149
|
ul.detail li {
|
150
|
+
font-size: 12px;
|
145
151
|
padding: 6px;
|
146
152
|
background-color: #eee;
|
147
153
|
margin: 1px;
|
@@ -149,19 +155,28 @@ ul.detail li {
|
|
149
155
|
ul.detail li span {
|
150
156
|
display: inline-block;
|
151
157
|
}
|
152
|
-
ul.detail li span.
|
158
|
+
ul.detail li span.action {
|
159
|
+
font-size: 10px;
|
160
|
+
width: 10px;
|
161
|
+
}
|
162
|
+
ul.detail li span.name, ul.detail li span.short_name {
|
153
163
|
width: 550px;
|
154
164
|
overflow-x: hidden;
|
165
|
+
white-space: nowrap;
|
155
166
|
font-family: monospace;
|
156
|
-
|
167
|
+
}
|
168
|
+
ul.detail li span.short_name {
|
169
|
+
width: 420px;
|
157
170
|
}
|
158
171
|
ul.detail li span.format {
|
159
172
|
float: right;
|
160
|
-
font-size: 12px;
|
161
173
|
font-weight: bold;
|
162
174
|
margin-left: 10px;
|
163
175
|
text-align: center;
|
164
176
|
}
|
177
|
+
ul.detail li span.date {
|
178
|
+
float: right;
|
179
|
+
}
|
165
180
|
|
166
181
|
ul.detail li span.format a:hover,
|
167
182
|
ul.detail li span.format a:link,
|
@@ -169,3 +184,15 @@ ul.detail li span.format a:visited,
|
|
169
184
|
ul.detail li span.format a:active {
|
170
185
|
color: #900;
|
171
186
|
}
|
187
|
+
ul.detail li span.action a:hover,
|
188
|
+
ul.detail li span.action a:link,
|
189
|
+
ul.detail li span.action a:visited,
|
190
|
+
ul.detail li span.action a:active {
|
191
|
+
color: #900;
|
192
|
+
}
|
193
|
+
ul.detail li span.date a:hover,
|
194
|
+
ul.detail li span.date a:link,
|
195
|
+
ul.detail li span.date a:visited,
|
196
|
+
ul.detail li span.date a:active {
|
197
|
+
color: #888;
|
198
|
+
}
|
data/scrappy.gemspec
CHANGED
@@ -2,50 +2,51 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.3.
|
5
|
+
s.version = "0.3.1"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-03-
|
9
|
+
s.date = %q{2011-03-18}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
13
13
|
s.executables = ["scrappy"]
|
14
|
-
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/
|
15
|
-
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/
|
14
|
+
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/extractor/extractor.rb", "lib/scrappy/extractor/formats.rb", "lib/scrappy/extractor/fragment.rb", "lib/scrappy/extractor/selector.rb", "lib/scrappy/extractor/selectors/base_uri.rb", "lib/scrappy/extractor/selectors/css.rb", "lib/scrappy/extractor/selectors/new_uri.rb", "lib/scrappy/extractor/selectors/root.rb", "lib/scrappy/extractor/selectors/section.rb", "lib/scrappy/extractor/selectors/slice.rb", "lib/scrappy/extractor/selectors/uri.rb", "lib/scrappy/extractor/selectors/uri_pattern.rb", "lib/scrappy/extractor/selectors/visual.rb", "lib/scrappy/extractor/selectors/xpath.rb", "lib/scrappy/repository.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb", "lib/scrappy/trainer/trainer.rb"]
|
15
|
+
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/extractor/extractor.rb", "lib/scrappy/extractor/formats.rb", "lib/scrappy/extractor/fragment.rb", "lib/scrappy/extractor/selector.rb", "lib/scrappy/extractor/selectors/base_uri.rb", "lib/scrappy/extractor/selectors/css.rb", "lib/scrappy/extractor/selectors/new_uri.rb", "lib/scrappy/extractor/selectors/root.rb", "lib/scrappy/extractor/selectors/section.rb", "lib/scrappy/extractor/selectors/slice.rb", "lib/scrappy/extractor/selectors/uri.rb", "lib/scrappy/extractor/selectors/uri_pattern.rb", "lib/scrappy/extractor/selectors/visual.rb", "lib/scrappy/extractor/selectors/xpath.rb", "lib/scrappy/repository.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb", "lib/scrappy/trainer/trainer.rb", "public/favicon.ico", "public/images/logo.png", "public/images/logo_tiny.png", "public/javascripts/annotator.js", "public/javascripts/remote.js", "public/stylesheets/application.css", "test/test_helper.rb", "test/test_scrappy.rb", "views/extractors.haml", "views/help.haml", "views/home.haml", "views/layout.haml", "views/patterns.haml", "views/samples.haml", "scrappy.gemspec"]
|
16
16
|
s.homepage = %q{http://github.com/josei/scrappy}
|
17
|
-
s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
|
18
17
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
|
19
18
|
s.require_paths = ["lib"]
|
20
19
|
s.rubyforge_project = %q{scrappy}
|
21
|
-
s.rubygems_version = %q{1.3.
|
20
|
+
s.rubygems_version = %q{1.3.6}
|
22
21
|
s.summary = %q{Web scraper that allows producing RDF data out of plain web pages}
|
23
|
-
s.test_files = ["test/
|
22
|
+
s.test_files = ["test/test_scrappy.rb", "test/test_helper.rb"]
|
24
23
|
|
25
24
|
if s.respond_to? :specification_version then
|
26
25
|
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
27
26
|
s.specification_version = 3
|
28
27
|
|
29
|
-
if Gem::Version.new(Gem::
|
28
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
30
29
|
s.add_runtime_dependency(%q<activesupport>, [">= 2.3.5"])
|
31
30
|
s.add_runtime_dependency(%q<sinatra>, [">= 1.1.2"])
|
32
31
|
s.add_runtime_dependency(%q<thin>, [">= 1.2.7"])
|
33
32
|
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
|
34
33
|
s.add_runtime_dependency(%q<mechanize>, [">= 1.0.0"])
|
35
|
-
s.add_runtime_dependency(%q<lightrdf>, [">= 0.
|
34
|
+
s.add_runtime_dependency(%q<lightrdf>, [">= 0.3.0"])
|
36
35
|
s.add_runtime_dependency(%q<i18n>, [">= 0.4.2"])
|
37
36
|
s.add_runtime_dependency(%q<rest-client>, [">= 1.6.1"])
|
38
37
|
s.add_runtime_dependency(%q<haml>, [">= 3.0.24"])
|
38
|
+
s.add_runtime_dependency(%q<rack-flash>, [">= 0.1.1"])
|
39
39
|
else
|
40
40
|
s.add_dependency(%q<activesupport>, [">= 2.3.5"])
|
41
41
|
s.add_dependency(%q<sinatra>, [">= 1.1.2"])
|
42
42
|
s.add_dependency(%q<thin>, [">= 1.2.7"])
|
43
43
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
44
44
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
45
|
-
s.add_dependency(%q<lightrdf>, [">= 0.
|
45
|
+
s.add_dependency(%q<lightrdf>, [">= 0.3.0"])
|
46
46
|
s.add_dependency(%q<i18n>, [">= 0.4.2"])
|
47
47
|
s.add_dependency(%q<rest-client>, [">= 1.6.1"])
|
48
48
|
s.add_dependency(%q<haml>, [">= 3.0.24"])
|
49
|
+
s.add_dependency(%q<rack-flash>, [">= 0.1.1"])
|
49
50
|
end
|
50
51
|
else
|
51
52
|
s.add_dependency(%q<activesupport>, [">= 2.3.5"])
|
@@ -53,9 +54,10 @@ Gem::Specification.new do |s|
|
|
53
54
|
s.add_dependency(%q<thin>, [">= 1.2.7"])
|
54
55
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
55
56
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
56
|
-
s.add_dependency(%q<lightrdf>, [">= 0.
|
57
|
+
s.add_dependency(%q<lightrdf>, [">= 0.3.0"])
|
57
58
|
s.add_dependency(%q<i18n>, [">= 0.4.2"])
|
58
59
|
s.add_dependency(%q<rest-client>, [">= 1.6.1"])
|
59
60
|
s.add_dependency(%q<haml>, [">= 3.0.24"])
|
61
|
+
s.add_dependency(%q<rack-flash>, [">= 0.1.1"])
|
60
62
|
end
|
61
63
|
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
#body
|
2
|
+
%h1 Extractors
|
3
|
+
%p
|
4
|
+
Extractors are mappings between HTML pages and RDF data. They are used to extract RDF data from plain web pages.
|
5
|
+
%p
|
6
|
+
-if @uris.empty?
|
7
|
+
Currently, there are no extractors.
|
8
|
+
-else
|
9
|
+
%ul.detail
|
10
|
+
-@uris.each do |uri|
|
11
|
+
%li
|
12
|
+
-if Scrappy::App.editable_kb?
|
13
|
+
%span.action
|
14
|
+
%a{:href=>"#{settings.base_uri}/extractors/#{CGI::escape(uri)}", :'data-method'=>:delete, :'data-confirm'=>"Are you sure you want to delete the extractor for #{uri}?"}
|
15
|
+
X
|
16
|
+
%span.name
|
17
|
+
-if !uri.include?('*')
|
18
|
+
%a{:href=>uri}=uri
|
19
|
+
-else
|
20
|
+
=uri
|
21
|
+
-if !uri.include?('*')
|
22
|
+
-[['RDF', :rdf], ['JSON', :ejson], ['YARF', :yarf], ['nTriples', :ntriples], ['PNG', :png]].reverse.each do |format, format_code|
|
23
|
+
%span.format
|
24
|
+
%a{:href=>"#{settings.base_uri}/#{format_code}/#{uri}"}=format
|
data/views/layout.haml
CHANGED
@@ -3,6 +3,8 @@
|
|
3
3
|
%head
|
4
4
|
%title Scrappy
|
5
5
|
%link{:type=>"text/css", :href=>"#{settings.base_uri}/stylesheets/application.css", :rel=>"stylesheet"}
|
6
|
+
%script{:src=>"https://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js"}
|
7
|
+
%script{:src=>"#{settings.base_uri}/javascripts/remote.js"}
|
6
8
|
%body
|
7
9
|
#bar
|
8
10
|
-if request.fullpath!='/'
|
@@ -12,11 +14,19 @@
|
|
12
14
|
%img{:src=>"#{settings.base_uri}/images/logo_tiny.png", :alt=>"Scrappy"}
|
13
15
|
%ul.right
|
14
16
|
%li
|
15
|
-
%a{:href=>"#{settings.base_uri}/
|
17
|
+
%a{:href=>"#{settings.base_uri}/extractors"} Extractors
|
18
|
+
%li
|
19
|
+
%a{:href=>"#{settings.base_uri}/patterns"} Patterns
|
20
|
+
%li
|
21
|
+
%a{:href=>"#{settings.base_uri}/samples"} Samples
|
16
22
|
%li
|
17
23
|
%a{:href=>"#{settings.base_uri}/help"} Help
|
24
|
+
-if flash[:notice]
|
25
|
+
#notice=flash[:notice]
|
18
26
|
=yield
|
19
27
|
#footer
|
20
|
-
%
|
21
|
-
|
22
|
-
|
28
|
+
%p
|
29
|
+
%a{:href=>"#{settings.base_uri}/"} Home
|
30
|
+
|
|
31
|
+
%a{:href=>'http://github.com/josei/scrappy'} About
|
32
|
+
%p==Scrappy v#{Scrappy::VERSION}
|
data/views/patterns.haml
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
#body
|
2
|
+
%h1 Patterns
|
3
|
+
%p
|
4
|
+
Patterns are visual conditions that are used to identify data in sites which do not have a defined extractor.
|
5
|
+
%p
|
6
|
+
-if @uris.empty?
|
7
|
+
Currently, there are no patterns.
|
8
|
+
-else
|
9
|
+
%ul.detail
|
10
|
+
-@uris.each do |uri|
|
11
|
+
%li
|
12
|
+
%span.action
|
13
|
+
%a{:href=>"#{settings.base_uri}/patterns/#{CGI::escape(uri)}", :'data-method'=>:delete, :'data-confirm'=>"Are you sure you want to delete the pattern for #{uri}?"}
|
14
|
+
X
|
15
|
+
%span.name
|
16
|
+
-if !uri.include?('*')
|
17
|
+
%a{:href=>uri}=uri
|
18
|
+
-else
|
19
|
+
=uri
|