scrappy 0.1.23 → 0.1.24
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +6 -0
- data/Rakefile +1 -1
- data/bin/scrappy +6 -6
- data/lib/scrappy/agent/extractor.rb +25 -14
- data/lib/scrappy/agent/visual_agent.rb +11 -8
- data/lib/scrappy/selectors/base_uri.rb +7 -3
- data/lib/scrappy/selectors/css.rb +8 -4
- data/lib/scrappy/selectors/new_uri.rb +30 -18
- data/lib/scrappy/selectors/root.rb +12 -9
- data/lib/scrappy/selectors/section.rb +15 -12
- data/lib/scrappy/selectors/slice.rb +13 -7
- data/lib/scrappy/selectors/uri.rb +11 -7
- data/lib/scrappy/selectors/uri_pattern.rb +12 -8
- data/lib/scrappy/selectors/xpath.rb +22 -19
- data/lib/scrappy/server/public/stylesheets/application.css +20 -9
- data/lib/scrappy/server/views/help.haml +5 -5
- data/lib/scrappy/server/views/home.haml +10 -9
- data/lib/scrappy/support.rb +7 -0
- data/lib/scrappy.rb +1 -1
- data/scrappy.gemspec +5 -5
- metadata +6 -6
data/History.txt
CHANGED
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ Echoe.new('scrappy', Scrappy::VERSION) do |p|
|
|
11
11
|
p.email = "joseignacio.fernandez@gmail.com"
|
12
12
|
p.install_message = '**(Optional) Remember to install rbwebkitgtk for visual parsing features**'
|
13
13
|
p.ignore_pattern = ["pkg/*"]
|
14
|
-
p.dependencies = [['activesupport','>= 2.3.5'], ['sinatra', '>= 1.1.2'], ['thin', '>= 1.2.7'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.
|
14
|
+
p.dependencies = [['activesupport','>= 2.3.5'], ['sinatra', '>= 1.1.2'], ['thin', '>= 1.2.7'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.2.0'], ['i18n', '>= 0.4.2'], ['haml', '>= 3.0.24']]
|
15
15
|
end
|
16
16
|
|
17
17
|
Rake::RDocTask.new(:rdoc) do |rdoc|
|
data/bin/scrappy
CHANGED
@@ -29,7 +29,7 @@ module Scrappy
|
|
29
29
|
args = ARGV.map { |arg| arg.split(" ") }.flatten
|
30
30
|
|
31
31
|
OptionParser.new do |opts|
|
32
|
-
opts.on('-
|
32
|
+
opts.on('-v', '--version') { output_version; exit 0 }
|
33
33
|
opts.on('-h', '--help') { output_help; exit 0 }
|
34
34
|
opts.on('-g URL', '--get URL') { |url| Options.url = url; Options.http_method=:get }
|
35
35
|
opts.on('-p URL', '--post URL') { |url| Options.url = url; Options.http_method=:post }
|
@@ -42,7 +42,7 @@ module Scrappy
|
|
42
42
|
opts.on('-c C', '--concurrence C') { |c| Agent::Options.workers = c.to_i }
|
43
43
|
opts.on('-d D', '--delay D') { |d| Agent::Options.delay = d; Agent::Options.workers = 1 }
|
44
44
|
opts.on('-l L', '--levels L') { |l| Agent::Options.depth = l.to_i }
|
45
|
-
opts.on('-
|
45
|
+
opts.on('-V', '--visual') { Agent::Options.agent = :visual; Agent::Options.workers = 1 }
|
46
46
|
opts.on('-r', '--reference') { Agent::Options.referenceable = :minimum }
|
47
47
|
opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
|
48
48
|
opts.on('-w', '--window') { Agent::Options.window = true }
|
@@ -85,8 +85,8 @@ module Scrappy
|
|
85
85
|
Scrappy is a tool to scrape semantic data out of the unstructured web
|
86
86
|
|
87
87
|
Examples
|
88
|
-
This command retrieves
|
89
|
-
scrappy -g http://www.
|
88
|
+
This command retrieves a web page
|
89
|
+
scrappy -g http://www.example.com
|
90
90
|
|
91
91
|
Usage
|
92
92
|
scrappy [options]
|
@@ -95,7 +95,7 @@ Usage
|
|
95
95
|
|
96
96
|
Options
|
97
97
|
-h, --help Displays help message
|
98
|
-
-
|
98
|
+
-v, --version Display the version, then exit
|
99
99
|
-f, --format Picks output format (json, ejson, rdfxml, ntriples, png)
|
100
100
|
-g, --get URL Gets requested URL
|
101
101
|
-p, --post URL Posts requested URL
|
@@ -108,7 +108,7 @@ Options
|
|
108
108
|
-s, --server [ROOT] Runs web server (optionally specify server's root url)
|
109
109
|
-S, --proxy-server Runs web proxy
|
110
110
|
-P, --port PORT Selects port number (default is 3434)
|
111
|
-
-
|
111
|
+
-V, --visual Uses visual agent (slow)
|
112
112
|
-r, --reference Outputs referenceable data
|
113
113
|
-R, --reference-all Outputs all HTML referenceable data
|
114
114
|
-w, --window Shows browser window (requires -v)
|
@@ -7,12 +7,12 @@ module Scrappy
|
|
7
7
|
print "Extracting #{uri}..."; $stdout.flush
|
8
8
|
end
|
9
9
|
|
10
|
+
@selector_pool ||= {}
|
10
11
|
triples = []
|
11
12
|
content = Nokogiri::HTML(html, nil, 'utf-8')
|
12
13
|
|
13
14
|
uri_selectors = (kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')) + kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector'))).flatten.select do |uri_selector|
|
14
|
-
|
15
|
-
results = Kernel.const_get(class_name).filter uri_selector, {:content=>content, :uri=>uri}
|
15
|
+
results = selector_pool(uri_selector).filter :content=>content, :uri=>uri
|
16
16
|
!results.empty?
|
17
17
|
end
|
18
18
|
|
@@ -41,9 +41,19 @@ module Scrappy
|
|
41
41
|
# Generate triples
|
42
42
|
docs.each do |doc|
|
43
43
|
# Build URIs if identifier present
|
44
|
-
nodes = fragment.sc::identifier.map { |s| filter s, doc
|
44
|
+
nodes = fragment.sc::identifier.map { |s| filter s, doc }.flatten.map do |d|
|
45
|
+
node = Node(parse_uri(uri, d[:value]))
|
46
|
+
if options[:referenceable]
|
47
|
+
# Include the fragment where the URI was built from
|
48
|
+
uri_node = Node(nil)
|
49
|
+
options[:triples] << [ node, Node("sc:uri"), uri_node ]
|
50
|
+
options[:triples] << [ uri_node, Node("rdf:value"), node.to_s ]
|
51
|
+
options[:triples] << [ uri_node, Node("sc:source"), Node(node_hash(d[:uri], d[:content].path)) ]
|
52
|
+
end
|
53
|
+
node
|
54
|
+
end
|
45
55
|
nodes << Node(nil) if nodes.empty?
|
46
|
-
|
56
|
+
|
47
57
|
nodes.each do |node|
|
48
58
|
# Build the object
|
49
59
|
object = if fragment.sc::type.include?(Node('rdf:Literal'))
|
@@ -82,9 +92,6 @@ module Scrappy
|
|
82
92
|
end
|
83
93
|
|
84
94
|
def filter selector, doc
|
85
|
-
# From "BaseUriSelector" to "base_uri"
|
86
|
-
class_name = selector.rdf::type.first.to_s.split('#').last
|
87
|
-
|
88
95
|
if !selector.sc::debug.empty? and options.debug
|
89
96
|
puts '== DEBUG'
|
90
97
|
puts '== Selector:'
|
@@ -96,7 +103,7 @@ module Scrappy
|
|
96
103
|
end
|
97
104
|
|
98
105
|
# Process selector
|
99
|
-
results =
|
106
|
+
results = selector_pool(selector).filter doc
|
100
107
|
|
101
108
|
if !selector.sc::debug.empty? and options.debug
|
102
109
|
puts "== No results" if results.empty?
|
@@ -128,7 +135,7 @@ module Scrappy
|
|
128
135
|
end
|
129
136
|
|
130
137
|
def add_referenceable_data content, triples, referenceable
|
131
|
-
resources = triples.
|
138
|
+
resources = {}; triples.each { |s,p,o| resources[o] = true }
|
132
139
|
|
133
140
|
fragment = Node(node_hash(uri, '/'))
|
134
141
|
selector = Node(nil)
|
@@ -136,7 +143,7 @@ module Scrappy
|
|
136
143
|
|
137
144
|
selector.rdf::type = Node('sc:UnivocalSelector')
|
138
145
|
selector.sc::path = '/'
|
139
|
-
selector.sc::
|
146
|
+
selector.sc::document = uri
|
140
147
|
|
141
148
|
fragment.sc::selector = selector
|
142
149
|
|
@@ -144,15 +151,15 @@ module Scrappy
|
|
144
151
|
|
145
152
|
content.search('*').each do |node|
|
146
153
|
fragment = Node(node_hash(uri, node.path))
|
147
|
-
|
148
|
-
if referenceable == :dump or resources
|
154
|
+
|
155
|
+
if referenceable == :dump or resources[fragment]
|
149
156
|
selector = Node(nil)
|
150
157
|
presentation = Node(nil)
|
151
158
|
|
152
159
|
selector.rdf::type = Node('sc:UnivocalSelector')
|
153
160
|
selector.sc::path = node.path.to_s
|
154
161
|
selector.sc::tag = node.name.to_s
|
155
|
-
selector.sc::
|
162
|
+
selector.sc::document = uri
|
156
163
|
|
157
164
|
presentation.sc::x = node[:vx].to_s if node[:vx]
|
158
165
|
presentation.sc::y = node[:vy].to_s if node[:vy]
|
@@ -175,7 +182,11 @@ module Scrappy
|
|
175
182
|
|
176
183
|
def node_hash uri, path
|
177
184
|
digest = Digest::MD5.hexdigest("#{uri} #{path}")
|
178
|
-
"_:bnode#{digest}"
|
185
|
+
:"_:bnode#{digest}"
|
186
|
+
end
|
187
|
+
|
188
|
+
def selector_pool selector
|
189
|
+
@selector_pool[selector.id] ||= kb.node(selector)
|
179
190
|
end
|
180
191
|
end
|
181
192
|
end
|
@@ -63,14 +63,17 @@ module Scrappy
|
|
63
63
|
var i=0;
|
64
64
|
for(var i=0; i<items.length; i++) {
|
65
65
|
var item = items[i];
|
66
|
-
item.setAttribute('vx', item.offsetLeft)
|
67
|
-
item.setAttribute('vy', item.offsetTop)
|
68
|
-
item.setAttribute('vw', item.offsetWidth)
|
69
|
-
item.setAttribute('vh', item.offsetHeight)
|
70
|
-
item.setAttribute('vsize', document.defaultView.getComputedStyle(item, null).getPropertyValue('font-size'))
|
71
|
-
|
72
|
-
|
73
|
-
|
66
|
+
item.setAttribute('vx', item.offsetLeft);
|
67
|
+
item.setAttribute('vy', item.offsetTop);
|
68
|
+
item.setAttribute('vw', item.offsetWidth);
|
69
|
+
item.setAttribute('vh', item.offsetHeight);
|
70
|
+
item.setAttribute('vsize', document.defaultView.getComputedStyle(item, null).getPropertyValue('font-size'));
|
71
|
+
var weight = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-weight');
|
72
|
+
if (weight == 'normal') weight = 400;
|
73
|
+
if (weight == 'bold') weight = 700;
|
74
|
+
item.setAttribute('vweight', weight);
|
75
|
+
item.setAttribute('vcolor', document.defaultView.getComputedStyle(item, null).getPropertyValue('color'));
|
76
|
+
item.setAttribute('vbcolor', document.defaultView.getComputedStyle(item, null).getPropertyValue('background-color'));
|
74
77
|
}"""
|
75
78
|
end
|
76
79
|
|
@@ -1,5 +1,9 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
|
1
|
+
module Sc
|
2
|
+
class BaseUriSelector
|
3
|
+
include RDF::NodeProxy
|
4
|
+
|
5
|
+
def filter doc
|
6
|
+
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:uri] } ]
|
7
|
+
end
|
4
8
|
end
|
5
9
|
end
|
@@ -1,6 +1,10 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
|
4
|
-
|
1
|
+
module Sc
|
2
|
+
class CssSelector
|
3
|
+
include RDF::NodeProxy
|
4
|
+
|
5
|
+
def filter doc
|
6
|
+
# By using Nokogiri, CSS and XPath use the same search method
|
7
|
+
Sc::XPathSelector.new(node).filter doc
|
8
|
+
end
|
5
9
|
end
|
6
10
|
end
|
@@ -1,23 +1,35 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
prefix = (prefix =~ /\Ahttp/ ? URI::parse(doc[:uri]).merge(prefix).to_s : "#{doc[:uri]}#{prefix}")
|
13
|
-
suffix = selector.sc::suffix.first.to_s
|
14
|
-
|
15
|
-
contents.map do |content|
|
16
|
-
variable = selector.sc::sequence.first.to_s=="true" ? (@@indexes[selector] += 1) : content.wikify
|
1
|
+
module Sc
|
2
|
+
class NewUriSelector
|
3
|
+
include RDF::NodeProxy
|
4
|
+
|
5
|
+
def filter doc
|
6
|
+
contents = if sc::attribute.first
|
7
|
+
# Select node's attribute if given
|
8
|
+
sc::attribute.map { |attribute| doc[:content][attribute] }
|
9
|
+
else
|
10
|
+
[ doc[:value] ]
|
11
|
+
end
|
17
12
|
|
18
|
-
|
13
|
+
@indexes ||= Hash.new(0)
|
14
|
+
prefix = sc::prefix.first.to_s
|
15
|
+
prefix = (prefix =~ /\Ahttp/ ? URI::parse(doc[:uri]).merge(prefix).to_s : "#{doc[:uri]}#{prefix}")
|
16
|
+
suffix = sc::suffix.first.to_s
|
19
17
|
|
20
|
-
|
18
|
+
contents.map do |content|
|
19
|
+
variable = if sc::sequence.first.to_s=="true"
|
20
|
+
@indexes[prefix] += 1
|
21
|
+
else
|
22
|
+
if sc::downcase.first.to_s=="true"
|
23
|
+
content.to_s.underscore
|
24
|
+
else
|
25
|
+
content.to_s.wikify
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
new_uri = "#{prefix}#{variable}#{suffix}"
|
30
|
+
|
31
|
+
{ :uri=>new_uri, :content=>doc[:content], :value=>new_uri }
|
32
|
+
end
|
21
33
|
end
|
22
34
|
end
|
23
35
|
end
|
@@ -1,12 +1,15 @@
|
|
1
|
-
module
|
2
|
-
|
1
|
+
module Sc
|
2
|
+
class RootSelector
|
3
|
+
include RDF::NodeProxy
|
4
|
+
include Scrappy::Formats
|
3
5
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
6
|
+
def filter doc
|
7
|
+
if sc::attribute.first
|
8
|
+
# Select node's attribute if given
|
9
|
+
sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute] } }
|
10
|
+
else
|
11
|
+
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:content], sc::format, doc[:uri]) } ]
|
12
|
+
end
|
10
13
|
end
|
11
14
|
end
|
12
|
-
end
|
15
|
+
end
|
@@ -1,14 +1,17 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
content
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
1
|
+
module Sc
|
2
|
+
class SectionSelector
|
3
|
+
include RDF::NodeProxy
|
4
|
+
include Scrappy::Formats
|
5
|
+
|
6
|
+
def filter doc
|
7
|
+
rdf::value.map do |pattern|
|
8
|
+
doc[:content].search('h1, h2, h3, h4, h5, h6, h7, h8, h9, h10').select { |n| n.parent.name!='script' and n.text.downcase.strip == pattern }.map do |node|
|
9
|
+
found = false
|
10
|
+
content = node.parent.children[node.parent.children.index(node)+1..-1].select { |n| found ||= (n.name==node.name or n.name=='div'); !found }
|
11
|
+
|
12
|
+
[ { :uri=>doc[:uri], :content=>content, :value=>content.map{|t| format(t, sc::format, doc[:uri])}.select{|t| t.to_s.strip!=''}*"\n\n" } ]
|
13
|
+
end
|
14
|
+
end.flatten
|
15
|
+
end
|
13
16
|
end
|
14
17
|
end
|
@@ -1,8 +1,14 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
1
|
+
module Sc
|
2
|
+
class SliceSelector
|
3
|
+
include RDF::NodeProxy
|
4
|
+
|
5
|
+
def filter doc
|
6
|
+
rdf::value.map do |separator|
|
7
|
+
slices = doc[:value].split(separator)
|
8
|
+
sc::index.map { |index| slices[index.to_i].to_s.strip }.
|
9
|
+
select { |value| value != "" }.
|
10
|
+
map { |value| { :uri=>doc[:uri], :content=>doc[:content], :value=>value} }
|
11
|
+
end.flatten
|
12
|
+
end
|
7
13
|
end
|
8
|
-
end
|
14
|
+
end
|
@@ -1,10 +1,14 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
[]
|
1
|
+
module Sc
|
2
|
+
class UriSelector
|
3
|
+
include RDF::NodeProxy
|
4
|
+
|
5
|
+
def filter doc
|
6
|
+
# Check if the UriSelector has this URI as value (without params: ?param1=value1&param2=value2)
|
7
|
+
if rdf::value.include?(doc[:uri].match(/\A([^\?]*)(\?.*\Z)?/).captures.first)
|
8
|
+
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
|
9
|
+
else
|
10
|
+
[]
|
11
|
+
end
|
8
12
|
end
|
9
13
|
end
|
10
14
|
end
|
@@ -1,10 +1,14 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
[]
|
1
|
+
module Sc
|
2
|
+
class UriPatternSelector
|
3
|
+
include RDF::NodeProxy
|
4
|
+
|
5
|
+
def filter doc
|
6
|
+
# Check if the uri fits the pattern
|
7
|
+
if rdf::value.any? { |v| doc[:uri] =~ /\A#{v.gsub('.','\.').gsub('*', '.+')}\Z/ }
|
8
|
+
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
|
9
|
+
else
|
10
|
+
[]
|
11
|
+
end
|
8
12
|
end
|
9
13
|
end
|
10
|
-
end
|
14
|
+
end
|
@@ -1,23 +1,26 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
end
|
11
|
-
patterns = selector.sc::keyword
|
12
|
-
(doc[:content].search(pattern)[interval] || []).select { |node| patterns.any? ? patterns.include?(node.text.downcase.strip) : true }.map do |result|
|
13
|
-
if selector.sc::attribute.first
|
14
|
-
# Select node's attribute if given
|
15
|
-
selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
|
1
|
+
module Sc
|
2
|
+
class XPathSelector
|
3
|
+
include RDF::NodeProxy
|
4
|
+
include Scrappy::Formats
|
5
|
+
|
6
|
+
def filter doc
|
7
|
+
rdf::value.map do |pattern|
|
8
|
+
interval = if sc::index.first
|
9
|
+
(sc::index.first.to_i..sc::index.first.to_i)
|
16
10
|
else
|
17
|
-
|
18
|
-
[ { :uri=>doc[:uri], :content=>result, :value=>format(result, selector.sc::format, doc[:uri]) } ]
|
11
|
+
(0..-1)
|
19
12
|
end
|
20
|
-
|
21
|
-
|
13
|
+
patterns = sc::keyword
|
14
|
+
(doc[:content].search(pattern)[interval] || []).select { |node| patterns.any? ? patterns.include?(node.text.downcase.strip) : true }.map do |result|
|
15
|
+
if sc::attribute.first
|
16
|
+
# Select node's attribute if given
|
17
|
+
sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
|
18
|
+
else
|
19
|
+
# Select node
|
20
|
+
[ { :uri=>doc[:uri], :content=>result, :value=>format(result, sc::format, doc[:uri]) } ]
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end.flatten
|
24
|
+
end
|
22
25
|
end
|
23
26
|
end
|
@@ -1,24 +1,32 @@
|
|
1
1
|
body {
|
2
2
|
font-family: Arial, sans;
|
3
3
|
}
|
4
|
-
|
4
|
+
#center {
|
5
5
|
text-align: center;
|
6
6
|
margin-top: 100px;
|
7
|
+
margin-bottom: 100px;
|
7
8
|
}
|
8
|
-
|
9
|
+
#search {
|
9
10
|
margin-top: 40px;
|
10
|
-
margin-bottom: 100px;
|
11
11
|
font-size:20px;
|
12
|
+
margin-bottom: 10px;
|
12
13
|
}
|
13
|
-
|
14
|
-
width:
|
14
|
+
#search input {
|
15
|
+
width: 700px; height:30px; font-size:16px;
|
15
16
|
}
|
16
|
-
|
17
|
-
width:
|
17
|
+
#buttons {
|
18
|
+
width: 400px;
|
19
|
+
margin: auto;
|
18
20
|
}
|
19
|
-
|
20
|
-
width:
|
21
|
+
#buttons select {
|
22
|
+
width: 100px; height: 30px; font-size:16px;
|
23
|
+
margin-left: 5px;
|
21
24
|
}
|
25
|
+
#buttons button {
|
26
|
+
width: 100px; height: 30px; font-size:16px;
|
27
|
+
margin-right: 5px;
|
28
|
+
}
|
29
|
+
|
22
30
|
pre {
|
23
31
|
width: 600px;
|
24
32
|
margin-left: auto;
|
@@ -37,4 +45,7 @@ pre {
|
|
37
45
|
}
|
38
46
|
#footer {
|
39
47
|
margin-top:30px; text-align: center; font-size:14px; color: #555;
|
48
|
+
}
|
49
|
+
img {
|
50
|
+
border: none;
|
40
51
|
}
|
@@ -2,10 +2,10 @@
|
|
2
2
|
%html
|
3
3
|
%head
|
4
4
|
%title Help - Scrappy
|
5
|
-
%link{:type=>"text/css", :href=>"/stylesheets/application.css", :rel=>"stylesheet"}
|
5
|
+
%link{:type=>"text/css", :href=>"#{settings.base_uri}/stylesheets/application.css", :rel=>"stylesheet"}
|
6
6
|
%body
|
7
7
|
%div#header
|
8
|
-
%img{:src=>
|
8
|
+
%img{:src=>"#{settings.base_uri}/images/logo_small.png"}
|
9
9
|
%div#body
|
10
10
|
%h1 Help
|
11
11
|
%p
|
@@ -15,11 +15,11 @@
|
|
15
15
|
%pre http://[host]/[format]/[url]
|
16
16
|
%p
|
17
17
|
For example, to retrieve http://example.com/~user/?test=1 with RDF serialization:
|
18
|
-
%pre http://localhost
|
18
|
+
%pre==#{settings.base_uri || "http://localhost:#{settings.port}"}/rdf/http://example.com/~user/%3Ftest%3D1
|
19
19
|
%p Available serialization formats are rdf, rdfxml, png, yarf, ntriples, turtle, json, jsonrdf, and ejson.
|
20
20
|
%div#footer
|
21
|
-
%a{:href=>"/"} Home
|
21
|
+
%a{:href=>"#{settings.base_uri}/"} Home
|
22
22
|
|
|
23
|
-
%a{:href=>"/help"} Help
|
23
|
+
%a{:href=>"#{settings.base_uri}/help"} Help
|
24
24
|
|
|
25
25
|
%a{:href=>'http://github.com/josei/scrappy'} About
|
@@ -2,24 +2,25 @@
|
|
2
2
|
%html
|
3
3
|
%head
|
4
4
|
%title Scrappy
|
5
|
-
%link{:type=>"text/css", :href=>"/stylesheets/application.css", :rel=>"stylesheet"}
|
5
|
+
%link{:type=>"text/css", :href=>"#{settings.base_uri}/stylesheets/application.css", :rel=>"stylesheet"}
|
6
6
|
%body
|
7
|
-
%div
|
7
|
+
%div#center
|
8
8
|
%a{:href=>'http://github.com/josei/scrappy'}
|
9
|
-
%img{:src=>
|
10
|
-
%form
|
11
|
-
%div
|
9
|
+
%img{:src=>"#{settings.base_uri}/images/logo.png"}
|
10
|
+
%form
|
11
|
+
%div#search
|
12
|
+
%input{:name=>:uri, :size=>30, :type=>:text, :value=>'Enter URI...', :onclick=>"if(this.value=='Enter URI...') this.value='';"}
|
13
|
+
%div#buttons
|
14
|
+
%button Scrape
|
12
15
|
%select{:name=>:format}
|
13
16
|
%option{:value=>:rdf} RDF
|
14
17
|
%option{:value=>:png} PNG
|
15
18
|
%option{:value=>:ejson} JSON
|
16
19
|
%option{:value=>:yarf} YARF
|
17
20
|
%option{:value=>:ntriples} nTriples
|
18
|
-
%input{:name=>:uri, :size=>30, :type=>:text, :value=>'Enter URI...', :onclick=>"if(this.value=='Enter URI...') this.value='';"}
|
19
|
-
%button Scrape
|
20
21
|
%div#footer
|
21
|
-
%a{:href=>"/"} Home
|
22
|
+
%a{:href=>"#{settings.base_uri}/"} Home
|
22
23
|
|
|
23
|
-
%a{:href=>"/help"} Help
|
24
|
+
%a{:href=>"#{settings.base_uri}/help"} Help
|
24
25
|
|
|
25
26
|
%a{:href=>'http://github.com/josei/scrappy'} About
|
data/lib/scrappy/support.rb
CHANGED
@@ -22,4 +22,11 @@ class String
|
|
22
22
|
def wikify
|
23
23
|
gsub(/^[a-z]|\s+[a-z]/) { |a| a.upcase }.gsub(/\s/, '')
|
24
24
|
end
|
25
|
+
def underscore
|
26
|
+
self.gsub(/::/, '/').
|
27
|
+
gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
|
28
|
+
gsub(/([a-z\d])([A-Z])/,'\1_\2').
|
29
|
+
tr("-", "_").
|
30
|
+
downcase
|
31
|
+
end
|
25
32
|
end
|
data/lib/scrappy.rb
CHANGED
data/scrappy.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.1.
|
5
|
+
s.version = "0.1.24"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-03-
|
9
|
+
s.date = %q{2011-03-08}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
@@ -32,7 +32,7 @@ Gem::Specification.new do |s|
|
|
32
32
|
s.add_runtime_dependency(%q<thin>, [">= 1.2.7"])
|
33
33
|
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
|
34
34
|
s.add_runtime_dependency(%q<mechanize>, [">= 1.0.0"])
|
35
|
-
s.add_runtime_dependency(%q<lightrdf>, [">= 0.
|
35
|
+
s.add_runtime_dependency(%q<lightrdf>, [">= 0.2.0"])
|
36
36
|
s.add_runtime_dependency(%q<i18n>, [">= 0.4.2"])
|
37
37
|
s.add_runtime_dependency(%q<haml>, [">= 3.0.24"])
|
38
38
|
else
|
@@ -41,7 +41,7 @@ Gem::Specification.new do |s|
|
|
41
41
|
s.add_dependency(%q<thin>, [">= 1.2.7"])
|
42
42
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
43
43
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
44
|
-
s.add_dependency(%q<lightrdf>, [">= 0.
|
44
|
+
s.add_dependency(%q<lightrdf>, [">= 0.2.0"])
|
45
45
|
s.add_dependency(%q<i18n>, [">= 0.4.2"])
|
46
46
|
s.add_dependency(%q<haml>, [">= 3.0.24"])
|
47
47
|
end
|
@@ -51,7 +51,7 @@ Gem::Specification.new do |s|
|
|
51
51
|
s.add_dependency(%q<thin>, [">= 1.2.7"])
|
52
52
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
53
53
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
54
|
-
s.add_dependency(%q<lightrdf>, [">= 0.
|
54
|
+
s.add_dependency(%q<lightrdf>, [">= 0.2.0"])
|
55
55
|
s.add_dependency(%q<i18n>, [">= 0.4.2"])
|
56
56
|
s.add_dependency(%q<haml>, [">= 3.0.24"])
|
57
57
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
-
|
9
|
-
version: 0.1.
|
8
|
+
- 24
|
9
|
+
version: 0.1.24
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-03-
|
17
|
+
date: 2011-03-08 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -96,9 +96,9 @@ dependencies:
|
|
96
96
|
- !ruby/object:Gem::Version
|
97
97
|
segments:
|
98
98
|
- 0
|
99
|
-
-
|
100
|
-
-
|
101
|
-
version: 0.
|
99
|
+
- 2
|
100
|
+
- 0
|
101
|
+
version: 0.2.0
|
102
102
|
type: :runtime
|
103
103
|
version_requirements: *id006
|
104
104
|
- !ruby/object:Gem::Dependency
|