scrappy 0.3.5 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +5 -0
- data/Rakefile +1 -1
- data/bin/scrappy +75 -17
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/extractor/extractor.rb +11 -8
- data/lib/scrappy/extractor/fragment.rb +1 -1
- data/lib/scrappy/extractor/selector.rb +6 -2
- data/lib/scrappy/extractor/selectors/uri_pattern.rb +1 -1
- data/lib/scrappy/extractor/selectors/visual.rb +66 -52
- data/lib/scrappy/learning/optimizer.rb +355 -107
- data/lib/scrappy/learning/trainer.rb +112 -40
- data/lib/scrappy/server/admin.rb +180 -17
- data/lib/scrappy/support.rb +0 -24
- data/public/javascripts/annotator.js +1 -1
- data/public/stylesheets/application.css +33 -0
- data/scrappy.gemspec +5 -5
- data/views/help.haml +1 -2
- data/views/layout.haml +1 -0
- data/views/patterns.haml +10 -5
- data/views/samples.haml +46 -22
- metadata +6 -6
@@ -53,8 +53,8 @@ jQuery(document).ready(function(){
|
|
53
53
|
} else {
|
54
54
|
div = "<div id='scrappy_window' title='Scrappy'>" +
|
55
55
|
"<p>No extractor available for this URL</p>" +
|
56
|
-
"<p><a href='TODO'>Annotate page</a></p>" +
|
57
56
|
"<p><a class='extractor' href='http://localhost:3434/extractors'>Generate extractor</a></p>" +
|
57
|
+
"<p><a class='sample' href='http://localhost:3434/samples'>Upload sample</a></p>" +
|
58
58
|
"</div>";
|
59
59
|
}
|
60
60
|
|
@@ -9,6 +9,17 @@ pre {
|
|
9
9
|
border: 1px solid;
|
10
10
|
padding: 10px;
|
11
11
|
}
|
12
|
+
pre.wide {
|
13
|
+
width: 770px;
|
14
|
+
max-height: 900px;
|
15
|
+
margin-left: auto;
|
16
|
+
margin-right: auto;
|
17
|
+
border: 1px solid;
|
18
|
+
font-size: 12px;
|
19
|
+
overflow: scroll;
|
20
|
+
padding: 10px;
|
21
|
+
font-family: monospace;
|
22
|
+
}
|
12
23
|
a:link, a:visited {
|
13
24
|
color: #33f;
|
14
25
|
text-decoration: none;
|
@@ -152,6 +163,9 @@ ul.detail li {
|
|
152
163
|
background-color: #eee;
|
153
164
|
margin: 1px;
|
154
165
|
}
|
166
|
+
ul.detail li.special {
|
167
|
+
background-color: #fff;
|
168
|
+
}
|
155
169
|
ul.detail li span {
|
156
170
|
display: inline-block;
|
157
171
|
}
|
@@ -174,6 +188,9 @@ ul.detail li span.format {
|
|
174
188
|
margin-left: 10px;
|
175
189
|
text-align: center;
|
176
190
|
}
|
191
|
+
ul.detail li.special span.format {
|
192
|
+
font-weight: normal;
|
193
|
+
}
|
177
194
|
ul.detail li span.date {
|
178
195
|
float: right;
|
179
196
|
}
|
@@ -196,3 +213,19 @@ ul.detail li span.date a:visited,
|
|
196
213
|
ul.detail li span.date a:active {
|
197
214
|
color: #888;
|
198
215
|
}
|
216
|
+
|
217
|
+
fieldset {
|
218
|
+
border: 0;
|
219
|
+
padding: 0;
|
220
|
+
margin: 0;
|
221
|
+
vertical-align: baseline;
|
222
|
+
}
|
223
|
+
|
224
|
+
textarea {
|
225
|
+
width: 100%;
|
226
|
+
}
|
227
|
+
|
228
|
+
span.type {
|
229
|
+
float: left;
|
230
|
+
width: 80px;
|
231
|
+
}
|
data/scrappy.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.3.5"
|
5
|
+
s.version = "0.4.0"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-
|
9
|
+
s.date = %q{2011-06-30}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
@@ -31,7 +31,7 @@ Gem::Specification.new do |s|
|
|
31
31
|
s.add_runtime_dependency(%q<thin>, [">= 1.2.7"])
|
32
32
|
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
|
33
33
|
s.add_runtime_dependency(%q<mechanize>, [">= 1.0.0"])
|
34
|
-
s.add_runtime_dependency(%q<lightrdf>, [">= 0.3.
|
34
|
+
s.add_runtime_dependency(%q<lightrdf>, [">= 0.3.9"])
|
35
35
|
s.add_runtime_dependency(%q<i18n>, [">= 0.4.2"])
|
36
36
|
s.add_runtime_dependency(%q<rest-client>, [">= 1.6.1"])
|
37
37
|
s.add_runtime_dependency(%q<haml>, [">= 3.0.24"])
|
@@ -42,7 +42,7 @@ Gem::Specification.new do |s|
|
|
42
42
|
s.add_dependency(%q<thin>, [">= 1.2.7"])
|
43
43
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
44
44
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
45
|
-
s.add_dependency(%q<lightrdf>, [">= 0.3.
|
45
|
+
s.add_dependency(%q<lightrdf>, [">= 0.3.9"])
|
46
46
|
s.add_dependency(%q<i18n>, [">= 0.4.2"])
|
47
47
|
s.add_dependency(%q<rest-client>, [">= 1.6.1"])
|
48
48
|
s.add_dependency(%q<haml>, [">= 3.0.24"])
|
@@ -54,7 +54,7 @@ Gem::Specification.new do |s|
|
|
54
54
|
s.add_dependency(%q<thin>, [">= 1.2.7"])
|
55
55
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
56
56
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
57
|
-
s.add_dependency(%q<lightrdf>, [">= 0.3.
|
57
|
+
s.add_dependency(%q<lightrdf>, [">= 0.3.9"])
|
58
58
|
s.add_dependency(%q<i18n>, [">= 0.4.2"])
|
59
59
|
s.add_dependency(%q<rest-client>, [">= 1.6.1"])
|
60
60
|
s.add_dependency(%q<haml>, [">= 3.0.24"])
|
data/views/help.haml
CHANGED
@@ -16,5 +16,4 @@
|
|
16
16
|
Drag this to your bookmarks:
|
17
17
|
%a.bookmark{:href=>bookmark_js, :onclick=>drag_js} Scrappy
|
18
18
|
%p
|
19
|
-
Then visit the web page you want to build a extractor for.
|
20
|
-
Click on your "Scrappy" bookmark and annotate the web page.
|
19
|
+
Then visit the web page you want to build a extractor for and click on your "Scrappy" bookmark.
|
data/views/layout.haml
CHANGED
@@ -4,6 +4,7 @@
|
|
4
4
|
%title Scrappy
|
5
5
|
%link{:type=>"text/css", :href=>"#{settings.base_uri}/stylesheets/application.css", :rel=>"stylesheet"}
|
6
6
|
%script{:src=>"https://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js"}
|
7
|
+
%script{:src=>"#{settings.base_uri}/javascripts/utils.js"}
|
7
8
|
%script{:src=>"#{settings.base_uri}/javascripts/remote.js"}
|
8
9
|
%body
|
9
10
|
#bar
|
data/views/patterns.haml
CHANGED
@@ -3,17 +3,22 @@
|
|
3
3
|
%p
|
4
4
|
Patterns are visual conditions that are used to identify data in sites which do not have a defined extractor.
|
5
5
|
%p
|
6
|
-
-if @
|
6
|
+
-if @patterns.empty?
|
7
7
|
Currently, there are no patterns.
|
8
8
|
-else
|
9
9
|
%ul.detail
|
10
|
-
-@
|
10
|
+
-@patterns.each do |pattern|
|
11
|
+
-uri = pattern.sc::type.first.to_s
|
11
12
|
%li
|
12
13
|
%span.action
|
13
|
-
%a{:href=>"#{settings.base_uri}/patterns/#{CGI::escape(
|
14
|
+
%a{:href=>"#{settings.base_uri}/patterns/#{CGI::escape(pattern.to_s)}", :'data-method'=>:delete, :'data-confirm'=>"Are you sure you want to delete this pattern?"}
|
14
15
|
X
|
15
16
|
%span.name
|
16
17
|
-if !uri.include?('*')
|
17
|
-
%a{:href=>
|
18
|
+
%a{:href=>"#{settings.base_uri}/patterns/#{CGI::escape(pattern.to_s)}"}=uri
|
18
19
|
-else
|
19
|
-
=uri
|
20
|
+
=uri
|
21
|
+
%p
|
22
|
+
%a{:href=>"#{settings.base_uri}/patterns/visual"} See patterns visually
|
23
|
+
|
|
24
|
+
%a{:href=>"#{settings.base_uri}/patterns", :'data-method'=>:delete, :'data-confirm'=>"Are you sure you want to delete all the patterns?"} Delete all patterns
|
data/views/samples.haml
CHANGED
@@ -3,28 +3,52 @@
|
|
3
3
|
%p
|
4
4
|
Sample pages are used to build extractors as well as visual patterns that can be applied to retrieve data
|
5
5
|
from other pages.
|
6
|
-
%
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
%li
|
6
|
+
%form{:method=>:post}
|
7
|
+
%p
|
8
|
+
-if @samples.empty?
|
9
|
+
Currently, there are no samples.
|
10
|
+
-else
|
11
|
+
%ul.detail
|
12
|
+
%li.special
|
13
13
|
%span.action
|
14
|
-
%a{:href=>"#{settings.base_uri}/samples/#{i}", :'data-method'=>:delete, :'data-confirm'=>"Are you sure you want to delete the sample #{sample[:uri]}?"}
|
15
|
-
X
|
16
|
-
%span.short_name
|
17
|
-
-if !sample[:uri].include?('*')
|
18
|
-
%a{:href=>sample[:uri]}=sample[:uri]
|
19
|
-
-else
|
20
|
-
=sample[:uri]
|
21
|
-
-[['Patterns output', :patterns], ['Extractors output', :extractors]].reverse.each do |text, action|
|
22
|
-
%span.format
|
23
|
-
%a{:href=>"#{settings.base_uri}/samples/#{i}/#{action}"}=text
|
24
14
|
%span.format
|
25
|
-
%
|
15
|
+
%input.checkall{:type=>:checkbox}
|
26
16
|
%span.format
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
17
|
+
Select all
|
18
|
+
-@samples.each_with_index do |sample,i|
|
19
|
+
%li
|
20
|
+
%span.action
|
21
|
+
%a{:href=>"#{settings.base_uri}/samples/#{i}", :'data-method'=>:delete, :'data-confirm'=>"Are you sure you want to delete the sample #{sample[:uri]}?"}
|
22
|
+
X
|
23
|
+
%span.short_name
|
24
|
+
-if !sample[:uri].include?('*')
|
25
|
+
%a{:href=>sample[:uri]}=sample[:uri]
|
26
|
+
-else
|
27
|
+
=sample[:uri]
|
28
|
+
%span.format
|
29
|
+
%input{:type=>:checkbox, :name=>'samples[]', :value=>i}
|
30
|
+
-[['Patterns', :patterns], ['Extractors', :extractors], ['Annotations', :annotations]].reverse.each do |text, action|
|
31
|
+
%span.format
|
32
|
+
%a{:href=>"#{settings.base_uri}/samples/#{i}/#{action}"}=text
|
33
|
+
%span.format
|
34
|
+
%a{:href=>"#{settings.base_uri}/samples/#{i}/raw"} RAW
|
35
|
+
%span.date
|
36
|
+
%a{:href=>"#{settings.base_uri}/samples/#{i}"}
|
37
|
+
=sample[:date].strftime("%Y/%m/%d - %H:%M")
|
38
|
+
%p
|
39
|
+
%span.type General:
|
40
|
+
%a.checksend{:href=>"#{settings.base_uri}/samples/annotate", :title=>'This will store extractors output as the correct samples output'} Annotate
|
41
|
+
%p
|
42
|
+
%span.type Extractors:
|
43
|
+
%a.checksend{:href=>"#{settings.base_uri}/samples/train/extractors", :title=>'This will generate extractors for each of the selected samples'} Train
|
44
|
+
|
|
45
|
+
%a.checksend{:href=>"#{settings.base_uri}/samples/optimize/extractors", :title=>'This will generalize extractors to improve the performance on the selected samples'} Optimize
|
46
|
+
|
|
47
|
+
%a.checksend{:href=>"#{settings.base_uri}/samples/test/extractors", :title=>'This will test extractors on the selected samples'} Test
|
48
|
+
%p
|
49
|
+
%span.type Patterns:
|
50
|
+
%a.checksend{:href=>"#{settings.base_uri}/samples/train/patterns", :title=>'This will generate patterns for each of the selected samples'} Train
|
51
|
+
|
|
52
|
+
%a.checksend{:href=>"#{settings.base_uri}/samples/optimize/patterns", :title=>'This will generalize patterns to improve the performance on the selected samples'} Optimize
|
53
|
+
|
|
54
|
+
%a.checksend{:href=>"#{settings.base_uri}/samples/test/patterns", :title=>'This will test patterns on the selected samples'} Test
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
- 3
|
8
|
-
- 5
|
9
|
-
version: 0.3.5
|
7
|
+
- 4
|
8
|
+
- 0
|
9
|
+
version: 0.4.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-06-30 00:00:00 +02:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -97,8 +97,8 @@ dependencies:
|
|
97
97
|
segments:
|
98
98
|
- 0
|
99
99
|
- 3
|
100
|
-
-
|
101
|
-
version: 0.3.
|
100
|
+
- 9
|
101
|
+
version: 0.3.9
|
102
102
|
type: :runtime
|
103
103
|
version_requirements: *id006
|
104
104
|
- !ruby/object:Gem::Dependency
|