scrappy 0.3.5 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -0
- data/Rakefile +1 -1
- data/bin/scrappy +75 -17
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/extractor/extractor.rb +11 -8
- data/lib/scrappy/extractor/fragment.rb +1 -1
- data/lib/scrappy/extractor/selector.rb +6 -2
- data/lib/scrappy/extractor/selectors/uri_pattern.rb +1 -1
- data/lib/scrappy/extractor/selectors/visual.rb +66 -52
- data/lib/scrappy/learning/optimizer.rb +355 -107
- data/lib/scrappy/learning/trainer.rb +112 -40
- data/lib/scrappy/server/admin.rb +180 -17
- data/lib/scrappy/support.rb +0 -24
- data/public/javascripts/annotator.js +1 -1
- data/public/stylesheets/application.css +33 -0
- data/scrappy.gemspec +5 -5
- data/views/help.haml +1 -2
- data/views/layout.haml +1 -0
- data/views/patterns.haml +10 -5
- data/views/samples.haml +46 -22
- metadata +6 -6
@@ -53,8 +53,8 @@ jQuery(document).ready(function(){
|
|
53
53
|
} else {
|
54
54
|
div = "<div id='scrappy_window' title='Scrappy'>" +
|
55
55
|
"<p>No extractor available for this URL</p>" +
|
56
|
-
"<p><a href='TODO'>Annotate page</a></p>" +
|
57
56
|
"<p><a class='extractor' href='http://localhost:3434/extractors'>Generate extractor</a></p>" +
|
57
|
+
"<p><a class='sample' href='http://localhost:3434/samples'>Upload sample</a></p>" +
|
58
58
|
"</div>";
|
59
59
|
}
|
60
60
|
|
@@ -9,6 +9,17 @@ pre {
|
|
9
9
|
border: 1px solid;
|
10
10
|
padding: 10px;
|
11
11
|
}
|
12
|
+
pre.wide {
|
13
|
+
width: 770px;
|
14
|
+
max-height: 900px;
|
15
|
+
margin-left: auto;
|
16
|
+
margin-right: auto;
|
17
|
+
border: 1px solid;
|
18
|
+
font-size: 12px;
|
19
|
+
overflow: scroll;
|
20
|
+
padding: 10px;
|
21
|
+
font-family: monospace;
|
22
|
+
}
|
12
23
|
a:link, a:visited {
|
13
24
|
color: #33f;
|
14
25
|
text-decoration: none;
|
@@ -152,6 +163,9 @@ ul.detail li {
|
|
152
163
|
background-color: #eee;
|
153
164
|
margin: 1px;
|
154
165
|
}
|
166
|
+
ul.detail li.special {
|
167
|
+
background-color: #fff;
|
168
|
+
}
|
155
169
|
ul.detail li span {
|
156
170
|
display: inline-block;
|
157
171
|
}
|
@@ -174,6 +188,9 @@ ul.detail li span.format {
|
|
174
188
|
margin-left: 10px;
|
175
189
|
text-align: center;
|
176
190
|
}
|
191
|
+
ul.detail li.special span.format {
|
192
|
+
font-weight: normal;
|
193
|
+
}
|
177
194
|
ul.detail li span.date {
|
178
195
|
float: right;
|
179
196
|
}
|
@@ -196,3 +213,19 @@ ul.detail li span.date a:visited,
|
|
196
213
|
ul.detail li span.date a:active {
|
197
214
|
color: #888;
|
198
215
|
}
|
216
|
+
|
217
|
+
fieldset {
|
218
|
+
border: 0;
|
219
|
+
padding: 0;
|
220
|
+
margin: 0;
|
221
|
+
vertical-align: baseline;
|
222
|
+
}
|
223
|
+
|
224
|
+
textarea {
|
225
|
+
width: 100%;
|
226
|
+
}
|
227
|
+
|
228
|
+
span.type {
|
229
|
+
float: left;
|
230
|
+
width: 80px;
|
231
|
+
}
|
data/scrappy.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.3.5"
|
5
|
+
s.version = "0.4.0"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-
|
9
|
+
s.date = %q{2011-06-30}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
@@ -31,7 +31,7 @@ Gem::Specification.new do |s|
|
|
31
31
|
s.add_runtime_dependency(%q<thin>, [">= 1.2.7"])
|
32
32
|
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
|
33
33
|
s.add_runtime_dependency(%q<mechanize>, [">= 1.0.0"])
|
34
|
-
s.add_runtime_dependency(%q<lightrdf>, [">= 0.3.
|
34
|
+
s.add_runtime_dependency(%q<lightrdf>, [">= 0.3.9"])
|
35
35
|
s.add_runtime_dependency(%q<i18n>, [">= 0.4.2"])
|
36
36
|
s.add_runtime_dependency(%q<rest-client>, [">= 1.6.1"])
|
37
37
|
s.add_runtime_dependency(%q<haml>, [">= 3.0.24"])
|
@@ -42,7 +42,7 @@ Gem::Specification.new do |s|
|
|
42
42
|
s.add_dependency(%q<thin>, [">= 1.2.7"])
|
43
43
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
44
44
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
45
|
-
s.add_dependency(%q<lightrdf>, [">= 0.3.
|
45
|
+
s.add_dependency(%q<lightrdf>, [">= 0.3.9"])
|
46
46
|
s.add_dependency(%q<i18n>, [">= 0.4.2"])
|
47
47
|
s.add_dependency(%q<rest-client>, [">= 1.6.1"])
|
48
48
|
s.add_dependency(%q<haml>, [">= 3.0.24"])
|
@@ -54,7 +54,7 @@ Gem::Specification.new do |s|
|
|
54
54
|
s.add_dependency(%q<thin>, [">= 1.2.7"])
|
55
55
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
56
56
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
57
|
-
s.add_dependency(%q<lightrdf>, [">= 0.3.
|
57
|
+
s.add_dependency(%q<lightrdf>, [">= 0.3.9"])
|
58
58
|
s.add_dependency(%q<i18n>, [">= 0.4.2"])
|
59
59
|
s.add_dependency(%q<rest-client>, [">= 1.6.1"])
|
60
60
|
s.add_dependency(%q<haml>, [">= 3.0.24"])
|
data/views/help.haml
CHANGED
@@ -16,5 +16,4 @@
|
|
16
16
|
Drag this to your bookmarks:
|
17
17
|
%a.bookmark{:href=>bookmark_js, :onclick=>drag_js} Scrappy
|
18
18
|
%p
|
19
|
-
Then visit the web page you want to build a extractor for.
|
20
|
-
Click on your "Scrappy" bookmark and annotate the web page.
|
19
|
+
Then visit the web page you want to build a extractor for and click on your "Scrappy" bookmark.
|
data/views/layout.haml
CHANGED
@@ -4,6 +4,7 @@
|
|
4
4
|
%title Scrappy
|
5
5
|
%link{:type=>"text/css", :href=>"#{settings.base_uri}/stylesheets/application.css", :rel=>"stylesheet"}
|
6
6
|
%script{:src=>"https://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js"}
|
7
|
+
%script{:src=>"#{settings.base_uri}/javascripts/utils.js"}
|
7
8
|
%script{:src=>"#{settings.base_uri}/javascripts/remote.js"}
|
8
9
|
%body
|
9
10
|
#bar
|
data/views/patterns.haml
CHANGED
@@ -3,17 +3,22 @@
|
|
3
3
|
%p
|
4
4
|
Patterns are visual conditions that are used to identify data in sites which do not have a defined extractor.
|
5
5
|
%p
|
6
|
-
-if @
|
6
|
+
-if @patterns.empty?
|
7
7
|
Currently, there are no patterns.
|
8
8
|
-else
|
9
9
|
%ul.detail
|
10
|
-
-@
|
10
|
+
-@patterns.each do |pattern|
|
11
|
+
-uri = pattern.sc::type.first.to_s
|
11
12
|
%li
|
12
13
|
%span.action
|
13
|
-
%a{:href=>"#{settings.base_uri}/patterns/#{CGI::escape(
|
14
|
+
%a{:href=>"#{settings.base_uri}/patterns/#{CGI::escape(pattern.to_s)}", :'data-method'=>:delete, :'data-confirm'=>"Are you sure you want to delete this pattern?"}
|
14
15
|
X
|
15
16
|
%span.name
|
16
17
|
-if !uri.include?('*')
|
17
|
-
%a{:href=>
|
18
|
+
%a{:href=>"#{settings.base_uri}/patterns/#{CGI::escape(pattern.to_s)}"}=uri
|
18
19
|
-else
|
19
|
-
=uri
|
20
|
+
=uri
|
21
|
+
%p
|
22
|
+
%a{:href=>"#{settings.base_uri}/patterns/visual"} See patterns visually
|
23
|
+
|
|
24
|
+
%a{:href=>"#{settings.base_uri}/patterns", :'data-method'=>:delete, :'data-confirm'=>"Are you sure you want to delete all the patterns?"} Delete all patterns
|
data/views/samples.haml
CHANGED
@@ -3,28 +3,52 @@
|
|
3
3
|
%p
|
4
4
|
Sample pages are used to build extractors as well as visual patterns that can be applied to retrieve data
|
5
5
|
from other pages.
|
6
|
-
%
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
%li
|
6
|
+
%form{:method=>:post}
|
7
|
+
%p
|
8
|
+
-if @samples.empty?
|
9
|
+
Currently, there are no samples.
|
10
|
+
-else
|
11
|
+
%ul.detail
|
12
|
+
%li.special
|
13
13
|
%span.action
|
14
|
-
%a{:href=>"#{settings.base_uri}/samples/#{i}", :'data-method'=>:delete, :'data-confirm'=>"Are you sure you want to delete the sample #{sample[:uri]}?"}
|
15
|
-
X
|
16
|
-
%span.short_name
|
17
|
-
-if !sample[:uri].include?('*')
|
18
|
-
%a{:href=>sample[:uri]}=sample[:uri]
|
19
|
-
-else
|
20
|
-
=sample[:uri]
|
21
|
-
-[['Patterns output', :patterns], ['Extractors output', :extractors]].reverse.each do |text, action|
|
22
|
-
%span.format
|
23
|
-
%a{:href=>"#{settings.base_uri}/samples/#{i}/#{action}"}=text
|
24
14
|
%span.format
|
25
|
-
%
|
15
|
+
%input.checkall{:type=>:checkbox}
|
26
16
|
%span.format
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
17
|
+
Select all
|
18
|
+
-@samples.each_with_index do |sample,i|
|
19
|
+
%li
|
20
|
+
%span.action
|
21
|
+
%a{:href=>"#{settings.base_uri}/samples/#{i}", :'data-method'=>:delete, :'data-confirm'=>"Are you sure you want to delete the sample #{sample[:uri]}?"}
|
22
|
+
X
|
23
|
+
%span.short_name
|
24
|
+
-if !sample[:uri].include?('*')
|
25
|
+
%a{:href=>sample[:uri]}=sample[:uri]
|
26
|
+
-else
|
27
|
+
=sample[:uri]
|
28
|
+
%span.format
|
29
|
+
%input{:type=>:checkbox, :name=>'samples[]', :value=>i}
|
30
|
+
-[['Patterns', :patterns], ['Extractors', :extractors], ['Annotations', :annotations]].reverse.each do |text, action|
|
31
|
+
%span.format
|
32
|
+
%a{:href=>"#{settings.base_uri}/samples/#{i}/#{action}"}=text
|
33
|
+
%span.format
|
34
|
+
%a{:href=>"#{settings.base_uri}/samples/#{i}/raw"} RAW
|
35
|
+
%span.date
|
36
|
+
%a{:href=>"#{settings.base_uri}/samples/#{i}"}
|
37
|
+
=sample[:date].strftime("%Y/%m/%d - %H:%M")
|
38
|
+
%p
|
39
|
+
%span.type General:
|
40
|
+
%a.checksend{:href=>"#{settings.base_uri}/samples/annotate", :title=>'This will store extractors output as the correct samples output'} Annotate
|
41
|
+
%p
|
42
|
+
%span.type Extractors:
|
43
|
+
%a.checksend{:href=>"#{settings.base_uri}/samples/train/extractors", :title=>'This will generate extractors for each of the selected samples'} Train
|
44
|
+
|
|
45
|
+
%a.checksend{:href=>"#{settings.base_uri}/samples/optimize/extractors", :title=>'This will generalize extractors to improve the performance on the selected samples'} Optimize
|
46
|
+
|
|
47
|
+
%a.checksend{:href=>"#{settings.base_uri}/samples/test/extractors", :title=>'This will test extractors on the selected samples'} Test
|
48
|
+
%p
|
49
|
+
%span.type Patterns:
|
50
|
+
%a.checksend{:href=>"#{settings.base_uri}/samples/train/patterns", :title=>'This will generate patterns for each of the selected samples'} Train
|
51
|
+
|
|
52
|
+
%a.checksend{:href=>"#{settings.base_uri}/samples/optimize/patterns", :title=>'This will generalize patterns to improve the performance on the selected samples'} Optimize
|
53
|
+
|
|
54
|
+
%a.checksend{:href=>"#{settings.base_uri}/samples/test/patterns", :title=>'This will test patterns on the selected samples'} Test
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
- 3
|
8
|
-
- 5
|
9
|
-
version: 0.3.5
|
7
|
+
- 4
|
8
|
+
- 0
|
9
|
+
version: 0.4.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-06-30 00:00:00 +02:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -97,8 +97,8 @@ dependencies:
|
|
97
97
|
segments:
|
98
98
|
- 0
|
99
99
|
- 3
|
100
|
-
-
|
101
|
-
version: 0.3.
|
100
|
+
- 9
|
101
|
+
version: 0.3.9
|
102
102
|
type: :runtime
|
103
103
|
version_requirements: *id006
|
104
104
|
- !ruby/object:Gem::Dependency
|