scrappy 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +6 -0
- data/Manifest +21 -14
- data/README.rdoc +5 -9
- data/Rakefile +1 -2
- data/bin/scrappy +141 -51
- data/lib/scrappy.rb +6 -9
- data/lib/scrappy/agent/agent.rb +3 -3
- data/lib/scrappy/extractor/extractor.rb +108 -0
- data/lib/scrappy/{agent → extractor}/formats.rb +0 -0
- data/lib/scrappy/extractor/fragment.rb +111 -0
- data/lib/scrappy/extractor/selector.rb +41 -0
- data/lib/scrappy/{selectors → extractor/selectors}/base_uri.rb +1 -3
- data/lib/scrappy/extractor/selectors/css.rb +5 -0
- data/lib/scrappy/{selectors → extractor/selectors}/new_uri.rb +1 -3
- data/lib/scrappy/{selectors → extractor/selectors}/root.rb +1 -4
- data/lib/scrappy/{selectors → extractor/selectors}/section.rb +1 -4
- data/lib/scrappy/{selectors → extractor/selectors}/slice.rb +1 -3
- data/lib/scrappy/{selectors → extractor/selectors}/uri.rb +2 -4
- data/lib/scrappy/{selectors → extractor/selectors}/uri_pattern.rb +2 -4
- data/lib/scrappy/extractor/selectors/visual.rb +39 -0
- data/lib/scrappy/{selectors → extractor/selectors}/xpath.rb +1 -4
- data/lib/scrappy/server/admin.rb +89 -2
- data/lib/scrappy/server/helpers.rb +11 -2
- data/lib/scrappy/server/server.rb +1 -0
- data/lib/scrappy/trainer/trainer.rb +101 -0
- data/public/javascripts/annotator.js +75 -0
- data/public/javascripts/remote.js +132 -0
- data/public/stylesheets/application.css +39 -12
- data/scrappy.gemspec +13 -11
- data/views/extractors.haml +24 -0
- data/views/layout.haml +14 -4
- data/views/patterns.haml +19 -0
- data/views/samples.haml +28 -0
- metadata +58 -56
- data/lib/scrappy/agent/extractor.rb +0 -196
- data/lib/scrappy/selectors/css.rb +0 -10
- data/public/javascripts/scrappy.js +0 -65
- data/views/kb.haml +0 -15
@@ -0,0 +1,75 @@
|
|
1
|
+
/**
 * Annotates every element of the current document with its visual properties
 * so they survive serialisation of the page's HTML.
 *
 * Attributes written on each element:
 *   vx, vy   - offsetLeft/offsetTop summed along the offsetParent chain
 *              (both stay 0 when the element has no offsetParent)
 *   vw, vh   - offsetWidth / offsetHeight
 *   vsize    - computed font-size with its two-character unit suffix removed
 *   vfont    - last entry of the computed font-family list, trimmed
 *   vweight  - computed font-weight, with 'normal' mapped to 400 and
 *              'bold' mapped to 700
 *
 * Elements for which no computed style is available only receive the
 * geometry attributes.
 */
var add_visual_data = function() {
  var view = document.defaultView;
  var items = document.documentElement.getElementsByTagName('*');

  for (var i = 0; i < items.length; i++) {
    var item = items[i];

    // Accumulate the offsets of the element and of all its offset ancestors.
    var x = 0;
    var y = 0;
    if (item.offsetParent) {
      for (var obj = item; obj; obj = obj.offsetParent) {
        x += obj.offsetLeft;
        y += obj.offsetTop;
      }
    }
    item.setAttribute('vx', x);
    item.setAttribute('vy', y);
    item.setAttribute('vw', item.offsetWidth);
    item.setAttribute('vh', item.offsetHeight);

    // Resolve the computed style once per element instead of once per property.
    var style = view.getComputedStyle(item, null);
    if (!style) {
      // Without a computed style there is no font information to record;
      // keep going so one element cannot abort the whole pass.
      continue;
    }

    // Drop the trailing two-character unit (computed sizes look like "16px").
    var size = style.getPropertyValue('font-size');
    item.setAttribute('vsize', size.substring(0, size.length - 2));

    // Only the last family of the list is recorded.
    var fonts = style.getPropertyValue('font-family').split(',');
    item.setAttribute('vfont', fonts[fonts.length - 1].trim());

    // Normalise the keyword weights to their numeric equivalents.
    var weight = style.getPropertyValue('font-weight');
    if (weight === 'normal') {
      weight = 400;
    } else if (weight === 'bold') {
      weight = 700;
    }
    item.setAttribute('vweight', weight);
  }
};
|
31
|
+
|
32
|
+
|
33
|
+
/**
 * Once the DOM is ready, appends the "Scrappy" dialog to the page and wires
 * its links.
 *
 * The dialog content depends on window.scrappy_extractor: when it is truthy
 * the dialog offers the RDF output and a sample upload, otherwise it offers
 * page annotation and extractor generation.
 *
 * Links with class 'extractor' or 'sample' are not followed; clicking them
 * POSTs the current page (html, encoding, uri) to the link's href.
 *
 * jQuery passes itself to the ready handler, so `$` is taken as a parameter
 * rather than read from the global scope, where the host page may have bound
 * it to something else.
 */
jQuery(document).ready(function($){
  var div;
  if (window.scrappy_extractor) {
    // NOTE(review): escape() is deprecated and encodes non-ASCII characters
    // as %uXXXX; it is kept because the URL format expected by the /rdf/
    // route is not visible here - confirm before switching encoders.
    div = "<div id='scrappy_window' title='Scrappy'>" +
          "<p>Extractor available for this URL</p>" +
          "<p><a href='http://localhost:3434/rdf/"+escape(window.location)+"'>See output</a></p>" +
          "<p><a class='sample' href='http://localhost:3434/samples'>Upload sample</a></p>" +
          "</div>";
  } else {
    div = "<div id='scrappy_window' title='Scrappy'>" +
          "<p>No extractor available for this URL</p>" +
          "<p><a href='TODO'>Annotate page</a></p>" +
          "<p><a class='extractor' href='http://localhost:3434/extractors'>Generate extractor</a></p>" +
          "</div>";
  }

  $("body").append(div);

  $('#scrappy_window .extractor, #scrappy_window .sample').live('click', function (e){
    // All locals are declared in one statement so none of them leaks onto
    // the host page's global object.
    var link = $(this),
        href = link.attr('href'),
        html = $('<input name="html" type="hidden" />'),
        enc  = $('<input name="encoding" type="hidden" />'),
        uri  = $('<input name="uri" type="hidden" />'),
        form = $('<form method="post"></form>');

    // Set the action as an attribute instead of concatenating it into markup.
    form.attr('action', href);

    enc.attr('value', document.characterSet);
    html.attr('value', document.documentElement.outerHTML);
    uri.attr('value', window.location.href);

    form.hide()
        .append(html)
        .append(enc)
        .append(uri)
        .appendTo('body');

    e.preventDefault();
    form.submit();
  });

  $("#scrappy_window").dialog();
});
|
72
|
+
|
73
|
+
// Annotate the document's elements with their visual attributes as soon as
// the script is evaluated.
add_visual_data();

// Flag set after the annotation pass has run.
// NOTE(review): presumably polled by whatever injects this script - confirm.
window.scrappy_loaded = true;
|
@@ -0,0 +1,132 @@
|
|
1
|
+
/**
 * Unobtrusive behaviours driven by data-* attributes:
 *
 *   data-confirm       - asks for confirmation before a click goes through
 *   data-remote        - submits the form / follows the link through $.ajax
 *   data-method        - follows a link by POSTing a hidden form carrying
 *                        a `_method` field (and the CSRF field when the page
 *                        exposes csrf-param / csrf-token meta tags)
 *   data-disable-with  - swaps the input's label and disables it while its
 *                        form is being submitted
 */
jQuery(function ($) {
    var csrf_token = $('meta[name=csrf-token]').attr('content'),
        csrf_param = $('meta[name=csrf-param]').attr('content');

    $.fn.extend({
        /**
         * Triggers a custom event on an element and returns the event result
         * this is used to get around not being able to ensure callbacks are placed
         * at the end of the chain.
         *
         * Returns false only when a handler explicitly returned false.
         *
         * TODO: deprecate with jQuery 1.4.2 release, in favor of subscribing to our
         *       own events and placing ourselves at the end of the chain.
         */
        triggerAndReturn: function (name, data) {
            var event = new $.Event(name);
            this.trigger(event, data);

            return event.result !== false;
        },

        /**
         * Handles execution of remote calls firing overridable events along the way.
         *
         * Reads the HTTP method from `method` / `data-method` (default GET),
         * the URL from `action` / `href`, and the expected response type from
         * `data-type` (default 'script'). Fires ajax:before (cancellable),
         * ajax:loading, ajax:success, ajax:failure, ajax:complete and
         * ajax:after on the element.
         *
         * Throws an Error when the element has neither `action` nor `href`.
         */
        callRemote: function () {
            var el       = this,
                method   = el.attr('method') || el.attr('data-method') || 'GET',
                url      = el.attr('action') || el.attr('href'),
                dataType = el.attr('data-type')  || 'script';

            if (url === undefined) {
                // Throw a real Error (not a bare string) so a stack trace is available.
                throw new Error("No URL specified for remote call (action or href must be present).");
            } else {
                if (el.triggerAndReturn('ajax:before')) {
                    // Only forms contribute request data.
                    var data = el.is('form') ? el.serializeArray() : [];
                    $.ajax({
                        url: url,
                        data: data,
                        dataType: dataType,
                        type: method.toUpperCase(),
                        beforeSend: function (xhr) {
                            el.trigger('ajax:loading', xhr);
                        },
                        success: function (response, status, xhr) {
                            el.trigger('ajax:success', [response, status, xhr]);
                        },
                        complete: function (xhr) {
                            el.trigger('ajax:complete', xhr);
                        },
                        error: function (xhr, status, error) {
                            el.trigger('ajax:failure', [xhr, status, error]);
                        }
                    });
                }

                // Fired whether or not ajax:before let the request through.
                el.trigger('ajax:after');
            }
        }
    });

    /**
     *  confirmation handler
     */
    $('a[data-confirm],input[data-confirm]').live('click', function () {
        var el = $(this);
        if (el.triggerAndReturn('confirm')) {
            if (!confirm(el.attr('data-confirm'))) {
                return false;
            }
        }
    });


    /**
     * remote handlers
     */
    $('form[data-remote]').live('submit', function (e) {
        $(this).callRemote();
        e.preventDefault();
    });

    $('a[data-remote],input[data-remote]').live('click', function (e) {
        $(this).callRemote();
        e.preventDefault();
    });

    /**
     * data-method links: POST a hidden form instead of following the link.
     * Values are assigned through .attr() rather than concatenated into
     * markup, so quotes inside the href, method or CSRF values cannot break
     * out of their attribute.
     */
    $('a[data-method]:not([data-remote])').live('click', function (e){
        var link = $(this),
            href = link.attr('href'),
            method = link.attr('data-method'),
            form = $('<form method="post"></form>'),
            method_input = $('<input name="_method" type="hidden" />');

        form.attr('action', href);
        method_input.attr('value', method);

        form.hide()
            .append(method_input);

        if (csrf_param != null && csrf_token != null) {
            var csrf_input = $('<input type="hidden" />');
            csrf_input.attr('name', csrf_param);
            csrf_input.attr('value', csrf_token);
            form.append(csrf_input);
        }

        form.appendTo('body');

        e.preventDefault();
        form.submit();
    });

    /**
     * disable-with handlers
     */
    var disable_with_input_selector           = 'input[data-disable-with]';
    var disable_with_form_remote_selector     = 'form[data-remote]:has('       + disable_with_input_selector + ')';
    var disable_with_form_not_remote_selector = 'form:not([data-remote]):has(' + disable_with_input_selector + ')';

    // Remembers each input's current label, replaces it with the
    // data-disable-with text and disables the input.
    var disable_with_input_function = function () {
        $(this).find(disable_with_input_selector).each(function () {
            var input = $(this);
            input.data('enable-with', input.val())
                .attr('value', input.attr('data-disable-with'))
                .attr('disabled', 'disabled');
        });
    };

    $(disable_with_form_remote_selector).live('ajax:before', disable_with_input_function);
    $(disable_with_form_not_remote_selector).live('submit', disable_with_input_function);

    // Remote forms stay on the page, so restore their inputs once the
    // request has completed.
    $(disable_with_form_remote_selector).live('ajax:complete', function () {
        $(this).find(disable_with_input_selector).each(function () {
            var input = $(this);
            input.removeAttr('disabled')
                 .val(input.data('enable-with'));
        });
    });

});
|
@@ -18,7 +18,7 @@ a:hover, a:active {
|
|
18
18
|
text-decoration: underline;
|
19
19
|
}
|
20
20
|
h1, h2, h3, h4 {
|
21
|
-
color: #
|
21
|
+
color: #888;
|
22
22
|
}
|
23
23
|
h2 {
|
24
24
|
font-weight: normal;
|
@@ -73,14 +73,19 @@ img {
|
|
73
73
|
#bar ul.right {
|
74
74
|
float: right;
|
75
75
|
}
|
76
|
-
#
|
77
|
-
float:left;
|
76
|
+
#notice {
|
78
77
|
margin-top: 20px;
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
78
|
+
margin-left: auto;
|
79
|
+
margin-right: auto;
|
80
|
+
padding: 15px;
|
81
|
+
width: 800px;
|
82
|
+
color: #333;
|
83
|
+
font-size: 14px;
|
84
|
+
text-align: center;
|
85
|
+
background: -webkit-gradient(linear, left top, left bottom, from(#af9), to(#dfd));
|
86
|
+
background: -moz-linear-gradient(top, #af9, #dfd);
|
87
|
+
border-radius: 10px;
|
88
|
+
-moz-border-radius: 10px;
|
84
89
|
}
|
85
90
|
#body {
|
86
91
|
margin: auto; width: 800px; padding: 15px;
|
@@ -112,7 +117,7 @@ img {
|
|
112
117
|
margin-right: 5px;
|
113
118
|
}
|
114
119
|
#footer {
|
115
|
-
margin-top:30px; text-align: center; font-size:
|
120
|
+
margin-top:30px; text-align: center; font-size:12px; color: #555;
|
116
121
|
height: 50px;
|
117
122
|
}
|
118
123
|
|
@@ -142,6 +147,7 @@ ul.detail {
|
|
142
147
|
list-style-type: none;
|
143
148
|
}
|
144
149
|
ul.detail li {
|
150
|
+
font-size: 12px;
|
145
151
|
padding: 6px;
|
146
152
|
background-color: #eee;
|
147
153
|
margin: 1px;
|
@@ -149,19 +155,28 @@ ul.detail li {
|
|
149
155
|
ul.detail li span {
|
150
156
|
display: inline-block;
|
151
157
|
}
|
152
|
-
ul.detail li span.
|
158
|
+
ul.detail li span.action {
|
159
|
+
font-size: 10px;
|
160
|
+
width: 10px;
|
161
|
+
}
|
162
|
+
ul.detail li span.name, ul.detail li span.short_name {
|
153
163
|
width: 550px;
|
154
164
|
overflow-x: hidden;
|
165
|
+
white-space: nowrap;
|
155
166
|
font-family: monospace;
|
156
|
-
|
167
|
+
}
|
168
|
+
ul.detail li span.short_name {
|
169
|
+
width: 420px;
|
157
170
|
}
|
158
171
|
ul.detail li span.format {
|
159
172
|
float: right;
|
160
|
-
font-size: 12px;
|
161
173
|
font-weight: bold;
|
162
174
|
margin-left: 10px;
|
163
175
|
text-align: center;
|
164
176
|
}
|
177
|
+
ul.detail li span.date {
|
178
|
+
float: right;
|
179
|
+
}
|
165
180
|
|
166
181
|
ul.detail li span.format a:hover,
|
167
182
|
ul.detail li span.format a:link,
|
@@ -169,3 +184,15 @@ ul.detail li span.format a:visited,
|
|
169
184
|
ul.detail li span.format a:active {
|
170
185
|
color: #900;
|
171
186
|
}
|
187
|
+
ul.detail li span.action a:hover,
|
188
|
+
ul.detail li span.action a:link,
|
189
|
+
ul.detail li span.action a:visited,
|
190
|
+
ul.detail li span.action a:active {
|
191
|
+
color: #900;
|
192
|
+
}
|
193
|
+
ul.detail li span.date a:hover,
|
194
|
+
ul.detail li span.date a:link,
|
195
|
+
ul.detail li span.date a:visited,
|
196
|
+
ul.detail li span.date a:active {
|
197
|
+
color: #888;
|
198
|
+
}
|
data/scrappy.gemspec
CHANGED
@@ -2,50 +2,51 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.3.
|
5
|
+
s.version = "0.3.1"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-03-
|
9
|
+
s.date = %q{2011-03-18}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
13
13
|
s.executables = ["scrappy"]
|
14
|
-
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/
|
15
|
-
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/
|
14
|
+
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/extractor/extractor.rb", "lib/scrappy/extractor/formats.rb", "lib/scrappy/extractor/fragment.rb", "lib/scrappy/extractor/selector.rb", "lib/scrappy/extractor/selectors/base_uri.rb", "lib/scrappy/extractor/selectors/css.rb", "lib/scrappy/extractor/selectors/new_uri.rb", "lib/scrappy/extractor/selectors/root.rb", "lib/scrappy/extractor/selectors/section.rb", "lib/scrappy/extractor/selectors/slice.rb", "lib/scrappy/extractor/selectors/uri.rb", "lib/scrappy/extractor/selectors/uri_pattern.rb", "lib/scrappy/extractor/selectors/visual.rb", "lib/scrappy/extractor/selectors/xpath.rb", "lib/scrappy/repository.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb", "lib/scrappy/trainer/trainer.rb"]
|
15
|
+
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/extractor/extractor.rb", "lib/scrappy/extractor/formats.rb", "lib/scrappy/extractor/fragment.rb", "lib/scrappy/extractor/selector.rb", "lib/scrappy/extractor/selectors/base_uri.rb", "lib/scrappy/extractor/selectors/css.rb", "lib/scrappy/extractor/selectors/new_uri.rb", "lib/scrappy/extractor/selectors/root.rb", "lib/scrappy/extractor/selectors/section.rb", "lib/scrappy/extractor/selectors/slice.rb", "lib/scrappy/extractor/selectors/uri.rb", "lib/scrappy/extractor/selectors/uri_pattern.rb", "lib/scrappy/extractor/selectors/visual.rb", "lib/scrappy/extractor/selectors/xpath.rb", "lib/scrappy/repository.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb", "lib/scrappy/trainer/trainer.rb", "public/favicon.ico", "public/images/logo.png", "public/images/logo_tiny.png", "public/javascripts/annotator.js", "public/javascripts/remote.js", "public/stylesheets/application.css", "test/test_helper.rb", "test/test_scrappy.rb", "views/extractors.haml", "views/help.haml", "views/home.haml", "views/layout.haml", "views/patterns.haml", "views/samples.haml", "scrappy.gemspec"]
|
16
16
|
s.homepage = %q{http://github.com/josei/scrappy}
|
17
|
-
s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
|
18
17
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
|
19
18
|
s.require_paths = ["lib"]
|
20
19
|
s.rubyforge_project = %q{scrappy}
|
21
|
-
s.rubygems_version = %q{1.3.
|
20
|
+
s.rubygems_version = %q{1.3.6}
|
22
21
|
s.summary = %q{Web scraper that allows producing RDF data out of plain web pages}
|
23
|
-
s.test_files = ["test/
|
22
|
+
s.test_files = ["test/test_scrappy.rb", "test/test_helper.rb"]
|
24
23
|
|
25
24
|
if s.respond_to? :specification_version then
|
26
25
|
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
27
26
|
s.specification_version = 3
|
28
27
|
|
29
|
-
if Gem::Version.new(Gem::
|
28
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
30
29
|
s.add_runtime_dependency(%q<activesupport>, [">= 2.3.5"])
|
31
30
|
s.add_runtime_dependency(%q<sinatra>, [">= 1.1.2"])
|
32
31
|
s.add_runtime_dependency(%q<thin>, [">= 1.2.7"])
|
33
32
|
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
|
34
33
|
s.add_runtime_dependency(%q<mechanize>, [">= 1.0.0"])
|
35
|
-
s.add_runtime_dependency(%q<lightrdf>, [">= 0.
|
34
|
+
s.add_runtime_dependency(%q<lightrdf>, [">= 0.3.0"])
|
36
35
|
s.add_runtime_dependency(%q<i18n>, [">= 0.4.2"])
|
37
36
|
s.add_runtime_dependency(%q<rest-client>, [">= 1.6.1"])
|
38
37
|
s.add_runtime_dependency(%q<haml>, [">= 3.0.24"])
|
38
|
+
s.add_runtime_dependency(%q<rack-flash>, [">= 0.1.1"])
|
39
39
|
else
|
40
40
|
s.add_dependency(%q<activesupport>, [">= 2.3.5"])
|
41
41
|
s.add_dependency(%q<sinatra>, [">= 1.1.2"])
|
42
42
|
s.add_dependency(%q<thin>, [">= 1.2.7"])
|
43
43
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
44
44
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
45
|
-
s.add_dependency(%q<lightrdf>, [">= 0.
|
45
|
+
s.add_dependency(%q<lightrdf>, [">= 0.3.0"])
|
46
46
|
s.add_dependency(%q<i18n>, [">= 0.4.2"])
|
47
47
|
s.add_dependency(%q<rest-client>, [">= 1.6.1"])
|
48
48
|
s.add_dependency(%q<haml>, [">= 3.0.24"])
|
49
|
+
s.add_dependency(%q<rack-flash>, [">= 0.1.1"])
|
49
50
|
end
|
50
51
|
else
|
51
52
|
s.add_dependency(%q<activesupport>, [">= 2.3.5"])
|
@@ -53,9 +54,10 @@ Gem::Specification.new do |s|
|
|
53
54
|
s.add_dependency(%q<thin>, [">= 1.2.7"])
|
54
55
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
55
56
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
56
|
-
s.add_dependency(%q<lightrdf>, [">= 0.
|
57
|
+
s.add_dependency(%q<lightrdf>, [">= 0.3.0"])
|
57
58
|
s.add_dependency(%q<i18n>, [">= 0.4.2"])
|
58
59
|
s.add_dependency(%q<rest-client>, [">= 1.6.1"])
|
59
60
|
s.add_dependency(%q<haml>, [">= 3.0.24"])
|
61
|
+
s.add_dependency(%q<rack-flash>, [">= 0.1.1"])
|
60
62
|
end
|
61
63
|
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
#body
|
2
|
+
%h1 Extractors
|
3
|
+
%p
|
4
|
+
Extractors are mappings between HTML pages and RDF data. They are used to extract RDF data from plain web pages.
|
5
|
+
%p
|
6
|
+
-if @uris.empty?
|
7
|
+
Currently, there are no extractors.
|
8
|
+
-else
|
9
|
+
%ul.detail
|
10
|
+
-@uris.each do |uri|
|
11
|
+
%li
|
12
|
+
-if Scrappy::App.editable_kb?
|
13
|
+
%span.action
|
14
|
+
%a{:href=>"#{settings.base_uri}/extractors/#{CGI::escape(uri)}", :'data-method'=>:delete, :'data-confirm'=>"Are you sure you want to delete the extractor for #{uri}?"}
|
15
|
+
X
|
16
|
+
%span.name
|
17
|
+
-if !uri.include?('*')
|
18
|
+
%a{:href=>uri}=uri
|
19
|
+
-else
|
20
|
+
=uri
|
21
|
+
-if !uri.include?('*')
|
22
|
+
-[['RDF', :rdf], ['JSON', :ejson], ['YARF', :yarf], ['nTriples', :ntriples], ['PNG', :png]].reverse.each do |format, format_code|
|
23
|
+
%span.format
|
24
|
+
%a{:href=>"#{settings.base_uri}/#{format_code}/#{uri}"}=format
|
data/views/layout.haml
CHANGED
@@ -3,6 +3,8 @@
|
|
3
3
|
%head
|
4
4
|
%title Scrappy
|
5
5
|
%link{:type=>"text/css", :href=>"#{settings.base_uri}/stylesheets/application.css", :rel=>"stylesheet"}
|
6
|
+
%script{:src=>"https://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js"}
|
7
|
+
%script{:src=>"#{settings.base_uri}/javascripts/remote.js"}
|
6
8
|
%body
|
7
9
|
#bar
|
8
10
|
-if request.fullpath!='/'
|
@@ -12,11 +14,19 @@
|
|
12
14
|
%img{:src=>"#{settings.base_uri}/images/logo_tiny.png", :alt=>"Scrappy"}
|
13
15
|
%ul.right
|
14
16
|
%li
|
15
|
-
%a{:href=>"#{settings.base_uri}/
|
17
|
+
%a{:href=>"#{settings.base_uri}/extractors"} Extractors
|
18
|
+
%li
|
19
|
+
%a{:href=>"#{settings.base_uri}/patterns"} Patterns
|
20
|
+
%li
|
21
|
+
%a{:href=>"#{settings.base_uri}/samples"} Samples
|
16
22
|
%li
|
17
23
|
%a{:href=>"#{settings.base_uri}/help"} Help
|
24
|
+
-if flash[:notice]
|
25
|
+
#notice=flash[:notice]
|
18
26
|
=yield
|
19
27
|
#footer
|
20
|
-
%
|
21
|
-
|
22
|
-
|
28
|
+
%p
|
29
|
+
%a{:href=>"#{settings.base_uri}/"} Home
|
30
|
+
|
|
31
|
+
%a{:href=>'http://github.com/josei/scrappy'} About
|
32
|
+
%p==Scrappy v#{Scrappy::VERSION}
|
data/views/patterns.haml
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
#body
|
2
|
+
%h1 Patterns
|
3
|
+
%p
|
4
|
+
Patterns are visual conditions that are used to identify data in sites which do not have a defined extractor.
|
5
|
+
%p
|
6
|
+
-if @uris.empty?
|
7
|
+
Currently, there are no patterns.
|
8
|
+
-else
|
9
|
+
%ul.detail
|
10
|
+
-@uris.each do |uri|
|
11
|
+
%li
|
12
|
+
%span.action
|
13
|
+
%a{:href=>"#{settings.base_uri}/patterns/#{CGI::escape(uri)}", :'data-method'=>:delete, :'data-confirm'=>"Are you sure you want to delete the pattern for #{uri}?"}
|
14
|
+
X
|
15
|
+
%span.name
|
16
|
+
-if !uri.include?('*')
|
17
|
+
%a{:href=>uri}=uri
|
18
|
+
-else
|
19
|
+
=uri
|