scrappy 0.1.23 → 0.1.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +6 -0
- data/Rakefile +1 -1
- data/bin/scrappy +6 -6
- data/lib/scrappy/agent/extractor.rb +25 -14
- data/lib/scrappy/agent/visual_agent.rb +11 -8
- data/lib/scrappy/selectors/base_uri.rb +7 -3
- data/lib/scrappy/selectors/css.rb +8 -4
- data/lib/scrappy/selectors/new_uri.rb +30 -18
- data/lib/scrappy/selectors/root.rb +12 -9
- data/lib/scrappy/selectors/section.rb +15 -12
- data/lib/scrappy/selectors/slice.rb +13 -7
- data/lib/scrappy/selectors/uri.rb +11 -7
- data/lib/scrappy/selectors/uri_pattern.rb +12 -8
- data/lib/scrappy/selectors/xpath.rb +22 -19
- data/lib/scrappy/server/public/stylesheets/application.css +20 -9
- data/lib/scrappy/server/views/help.haml +5 -5
- data/lib/scrappy/server/views/home.haml +10 -9
- data/lib/scrappy/support.rb +7 -0
- data/lib/scrappy.rb +1 -1
- data/scrappy.gemspec +5 -5
- metadata +6 -6
data/History.txt
CHANGED
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ Echoe.new('scrappy', Scrappy::VERSION) do |p|
|
|
11
11
|
p.email = "joseignacio.fernandez@gmail.com"
|
12
12
|
p.install_message = '**(Optional) Remember to install rbwebkitgtk for visual parsing features**'
|
13
13
|
p.ignore_pattern = ["pkg/*"]
|
14
|
-
p.dependencies = [['activesupport','>= 2.3.5'], ['sinatra', '>= 1.1.2'], ['thin', '>= 1.2.7'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.
|
14
|
+
p.dependencies = [['activesupport','>= 2.3.5'], ['sinatra', '>= 1.1.2'], ['thin', '>= 1.2.7'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.2.0'], ['i18n', '>= 0.4.2'], ['haml', '>= 3.0.24']]
|
15
15
|
end
|
16
16
|
|
17
17
|
Rake::RDocTask.new(:rdoc) do |rdoc|
|
data/bin/scrappy
CHANGED
@@ -29,7 +29,7 @@ module Scrappy
|
|
29
29
|
args = ARGV.map { |arg| arg.split(" ") }.flatten
|
30
30
|
|
31
31
|
OptionParser.new do |opts|
|
32
|
-
opts.on('-
|
32
|
+
opts.on('-v', '--version') { output_version; exit 0 }
|
33
33
|
opts.on('-h', '--help') { output_help; exit 0 }
|
34
34
|
opts.on('-g URL', '--get URL') { |url| Options.url = url; Options.http_method=:get }
|
35
35
|
opts.on('-p URL', '--post URL') { |url| Options.url = url; Options.http_method=:post }
|
@@ -42,7 +42,7 @@ module Scrappy
|
|
42
42
|
opts.on('-c C', '--concurrence C') { |c| Agent::Options.workers = c.to_i }
|
43
43
|
opts.on('-d D', '--delay D') { |d| Agent::Options.delay = d; Agent::Options.workers = 1 }
|
44
44
|
opts.on('-l L', '--levels L') { |l| Agent::Options.depth = l.to_i }
|
45
|
-
opts.on('-
|
45
|
+
opts.on('-V', '--visual') { Agent::Options.agent = :visual; Agent::Options.workers = 1 }
|
46
46
|
opts.on('-r', '--reference') { Agent::Options.referenceable = :minimum }
|
47
47
|
opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
|
48
48
|
opts.on('-w', '--window') { Agent::Options.window = true }
|
@@ -85,8 +85,8 @@ module Scrappy
|
|
85
85
|
Scrappy is a tool to scrape semantic data out of the unstructured web
|
86
86
|
|
87
87
|
Examples
|
88
|
-
This command retrieves
|
89
|
-
scrappy -g http://www.
|
88
|
+
This command retrieves a web page
|
89
|
+
scrappy -g http://www.example.com
|
90
90
|
|
91
91
|
Usage
|
92
92
|
scrappy [options]
|
@@ -95,7 +95,7 @@ Usage
|
|
95
95
|
|
96
96
|
Options
|
97
97
|
-h, --help Displays help message
|
98
|
-
-
|
98
|
+
-v, --version Display the version, then exit
|
99
99
|
-f, --format Picks output format (json, ejson, rdfxml, ntriples, png)
|
100
100
|
-g, --get URL Gets requested URL
|
101
101
|
-p, --post URL Posts requested URL
|
@@ -108,7 +108,7 @@ Options
|
|
108
108
|
-s, --server [ROOT] Runs web server (optionally specify server's root url)
|
109
109
|
-S, --proxy-server Runs web proxy
|
110
110
|
-P, --port PORT Selects port number (default is 3434)
|
111
|
-
-
|
111
|
+
-V, --visual Uses visual agent (slow)
|
112
112
|
-r, --reference Outputs referenceable data
|
113
113
|
-R, --reference-all Outputs all HTML referenceable data
|
114
114
|
-w, --window Shows browser window (requires -V)
|
@@ -7,12 +7,12 @@ module Scrappy
|
|
7
7
|
print "Extracting #{uri}..."; $stdout.flush
|
8
8
|
end
|
9
9
|
|
10
|
+
@selector_pool ||= {}
|
10
11
|
triples = []
|
11
12
|
content = Nokogiri::HTML(html, nil, 'utf-8')
|
12
13
|
|
13
14
|
uri_selectors = (kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')) + kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector'))).flatten.select do |uri_selector|
|
14
|
-
|
15
|
-
results = Kernel.const_get(class_name).filter uri_selector, {:content=>content, :uri=>uri}
|
15
|
+
results = selector_pool(uri_selector).filter :content=>content, :uri=>uri
|
16
16
|
!results.empty?
|
17
17
|
end
|
18
18
|
|
@@ -41,9 +41,19 @@ module Scrappy
|
|
41
41
|
# Generate triples
|
42
42
|
docs.each do |doc|
|
43
43
|
# Build URIs if identifier present
|
44
|
-
nodes = fragment.sc::identifier.map { |s| filter s, doc
|
44
|
+
nodes = fragment.sc::identifier.map { |s| filter s, doc }.flatten.map do |d|
|
45
|
+
node = Node(parse_uri(uri, d[:value]))
|
46
|
+
if options[:referenceable]
|
47
|
+
# Include the fragment where the URI was built from
|
48
|
+
uri_node = Node(nil)
|
49
|
+
options[:triples] << [ node, Node("sc:uri"), uri_node ]
|
50
|
+
options[:triples] << [ uri_node, Node("rdf:value"), node.to_s ]
|
51
|
+
options[:triples] << [ uri_node, Node("sc:source"), Node(node_hash(d[:uri], d[:content].path)) ]
|
52
|
+
end
|
53
|
+
node
|
54
|
+
end
|
45
55
|
nodes << Node(nil) if nodes.empty?
|
46
|
-
|
56
|
+
|
47
57
|
nodes.each do |node|
|
48
58
|
# Build the object
|
49
59
|
object = if fragment.sc::type.include?(Node('rdf:Literal'))
|
@@ -82,9 +92,6 @@ module Scrappy
|
|
82
92
|
end
|
83
93
|
|
84
94
|
def filter selector, doc
|
85
|
-
# From "BaseUriSelector" to "base_uri"
|
86
|
-
class_name = selector.rdf::type.first.to_s.split('#').last
|
87
|
-
|
88
95
|
if !selector.sc::debug.empty? and options.debug
|
89
96
|
puts '== DEBUG'
|
90
97
|
puts '== Selector:'
|
@@ -96,7 +103,7 @@ module Scrappy
|
|
96
103
|
end
|
97
104
|
|
98
105
|
# Process selector
|
99
|
-
results =
|
106
|
+
results = selector_pool(selector).filter doc
|
100
107
|
|
101
108
|
if !selector.sc::debug.empty? and options.debug
|
102
109
|
puts "== No results" if results.empty?
|
@@ -128,7 +135,7 @@ module Scrappy
|
|
128
135
|
end
|
129
136
|
|
130
137
|
def add_referenceable_data content, triples, referenceable
|
131
|
-
resources = triples.
|
138
|
+
resources = {}; triples.each { |s,p,o| resources[o] = true }
|
132
139
|
|
133
140
|
fragment = Node(node_hash(uri, '/'))
|
134
141
|
selector = Node(nil)
|
@@ -136,7 +143,7 @@ module Scrappy
|
|
136
143
|
|
137
144
|
selector.rdf::type = Node('sc:UnivocalSelector')
|
138
145
|
selector.sc::path = '/'
|
139
|
-
selector.sc::
|
146
|
+
selector.sc::document = uri
|
140
147
|
|
141
148
|
fragment.sc::selector = selector
|
142
149
|
|
@@ -144,15 +151,15 @@ module Scrappy
|
|
144
151
|
|
145
152
|
content.search('*').each do |node|
|
146
153
|
fragment = Node(node_hash(uri, node.path))
|
147
|
-
|
148
|
-
if referenceable == :dump or resources
|
154
|
+
|
155
|
+
if referenceable == :dump or resources[fragment]
|
149
156
|
selector = Node(nil)
|
150
157
|
presentation = Node(nil)
|
151
158
|
|
152
159
|
selector.rdf::type = Node('sc:UnivocalSelector')
|
153
160
|
selector.sc::path = node.path.to_s
|
154
161
|
selector.sc::tag = node.name.to_s
|
155
|
-
selector.sc::
|
162
|
+
selector.sc::document = uri
|
156
163
|
|
157
164
|
presentation.sc::x = node[:vx].to_s if node[:vx]
|
158
165
|
presentation.sc::y = node[:vy].to_s if node[:vy]
|
@@ -175,7 +182,11 @@ module Scrappy
|
|
175
182
|
|
176
183
|
def node_hash uri, path
|
177
184
|
digest = Digest::MD5.hexdigest("#{uri} #{path}")
|
178
|
-
"_:bnode#{digest}"
|
185
|
+
:"_:bnode#{digest}"
|
186
|
+
end
|
187
|
+
|
188
|
+
def selector_pool selector
|
189
|
+
@selector_pool[selector.id] ||= kb.node(selector)
|
179
190
|
end
|
180
191
|
end
|
181
192
|
end
|
@@ -63,14 +63,17 @@ module Scrappy
|
|
63
63
|
var i=0;
|
64
64
|
for(var i=0; i<items.length; i++) {
|
65
65
|
var item = items[i];
|
66
|
-
item.setAttribute('vx', item.offsetLeft)
|
67
|
-
item.setAttribute('vy', item.offsetTop)
|
68
|
-
item.setAttribute('vw', item.offsetWidth)
|
69
|
-
item.setAttribute('vh', item.offsetHeight)
|
70
|
-
item.setAttribute('vsize', document.defaultView.getComputedStyle(item, null).getPropertyValue('font-size'))
|
71
|
-
|
72
|
-
|
73
|
-
|
66
|
+
item.setAttribute('vx', item.offsetLeft);
|
67
|
+
item.setAttribute('vy', item.offsetTop);
|
68
|
+
item.setAttribute('vw', item.offsetWidth);
|
69
|
+
item.setAttribute('vh', item.offsetHeight);
|
70
|
+
item.setAttribute('vsize', document.defaultView.getComputedStyle(item, null).getPropertyValue('font-size'));
|
71
|
+
var weight = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-weight');
|
72
|
+
if (weight == 'normal') weight = 400;
|
73
|
+
if (weight == 'bold') weight = 700;
|
74
|
+
item.setAttribute('vweight', weight);
|
75
|
+
item.setAttribute('vcolor', document.defaultView.getComputedStyle(item, null).getPropertyValue('color'));
|
76
|
+
item.setAttribute('vbcolor', document.defaultView.getComputedStyle(item, null).getPropertyValue('background-color'));
|
74
77
|
}"""
|
75
78
|
end
|
76
79
|
|
@@ -1,5 +1,9 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
|
1
|
+
module Sc
|
2
|
+
class BaseUriSelector
|
3
|
+
include RDF::NodeProxy
|
4
|
+
|
5
|
+
def filter doc
|
6
|
+
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:uri] } ]
|
7
|
+
end
|
4
8
|
end
|
5
9
|
end
|
@@ -1,6 +1,10 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
|
4
|
-
|
1
|
+
module Sc
|
2
|
+
class CssSelector
|
3
|
+
include RDF::NodeProxy
|
4
|
+
|
5
|
+
def filter doc
|
6
|
+
# By using Nokogiri, CSS and XPath use the same search method
|
7
|
+
Sc::XPathSelector.new(node).filter doc
|
8
|
+
end
|
5
9
|
end
|
6
10
|
end
|
@@ -1,23 +1,35 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
prefix = (prefix =~ /\Ahttp/ ? URI::parse(doc[:uri]).merge(prefix).to_s : "#{doc[:uri]}#{prefix}")
|
13
|
-
suffix = selector.sc::suffix.first.to_s
|
14
|
-
|
15
|
-
contents.map do |content|
|
16
|
-
variable = selector.sc::sequence.first.to_s=="true" ? (@@indexes[selector] += 1) : content.wikify
|
1
|
+
module Sc
|
2
|
+
class NewUriSelector
|
3
|
+
include RDF::NodeProxy
|
4
|
+
|
5
|
+
def filter doc
|
6
|
+
contents = if sc::attribute.first
|
7
|
+
# Select node's attribute if given
|
8
|
+
sc::attribute.map { |attribute| doc[:content][attribute] }
|
9
|
+
else
|
10
|
+
[ doc[:value] ]
|
11
|
+
end
|
17
12
|
|
18
|
-
|
13
|
+
@indexes ||= Hash.new(0)
|
14
|
+
prefix = sc::prefix.first.to_s
|
15
|
+
prefix = (prefix =~ /\Ahttp/ ? URI::parse(doc[:uri]).merge(prefix).to_s : "#{doc[:uri]}#{prefix}")
|
16
|
+
suffix = sc::suffix.first.to_s
|
19
17
|
|
20
|
-
|
18
|
+
contents.map do |content|
|
19
|
+
variable = if sc::sequence.first.to_s=="true"
|
20
|
+
@indexes[prefix] += 1
|
21
|
+
else
|
22
|
+
if sc::downcase.first.to_s=="true"
|
23
|
+
content.to_s.underscore
|
24
|
+
else
|
25
|
+
content.to_s.wikify
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
new_uri = "#{prefix}#{variable}#{suffix}"
|
30
|
+
|
31
|
+
{ :uri=>new_uri, :content=>doc[:content], :value=>new_uri }
|
32
|
+
end
|
21
33
|
end
|
22
34
|
end
|
23
35
|
end
|
@@ -1,12 +1,15 @@
|
|
1
|
-
module
|
2
|
-
|
1
|
+
module Sc
|
2
|
+
class RootSelector
|
3
|
+
include RDF::NodeProxy
|
4
|
+
include Scrappy::Formats
|
3
5
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
6
|
+
def filter doc
|
7
|
+
if sc::attribute.first
|
8
|
+
# Select node's attribute if given
|
9
|
+
sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute] } }
|
10
|
+
else
|
11
|
+
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:content], sc::format, doc[:uri]) } ]
|
12
|
+
end
|
10
13
|
end
|
11
14
|
end
|
12
|
-
end
|
15
|
+
end
|
@@ -1,14 +1,17 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
content
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
1
|
+
module Sc
|
2
|
+
class SectionSelector
|
3
|
+
include RDF::NodeProxy
|
4
|
+
include Scrappy::Formats
|
5
|
+
|
6
|
+
def filter doc
|
7
|
+
rdf::value.map do |pattern|
|
8
|
+
doc[:content].search('h1, h2, h3, h4, h5, h6, h7, h8, h9, h10').select { |n| n.parent.name!='script' and n.text.downcase.strip == pattern }.map do |node|
|
9
|
+
found = false
|
10
|
+
content = node.parent.children[node.parent.children.index(node)+1..-1].select { |n| found ||= (n.name==node.name or n.name=='div'); !found }
|
11
|
+
|
12
|
+
[ { :uri=>doc[:uri], :content=>content, :value=>content.map{|t| format(t, sc::format, doc[:uri])}.select{|t| t.to_s.strip!=''}*"\n\n" } ]
|
13
|
+
end
|
14
|
+
end.flatten
|
15
|
+
end
|
13
16
|
end
|
14
17
|
end
|
@@ -1,8 +1,14 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
1
|
+
module Sc
|
2
|
+
class SliceSelector
|
3
|
+
include RDF::NodeProxy
|
4
|
+
|
5
|
+
def filter doc
|
6
|
+
rdf::value.map do |separator|
|
7
|
+
slices = doc[:value].split(separator)
|
8
|
+
sc::index.map { |index| slices[index.to_i].to_s.strip }.
|
9
|
+
select { |value| value != "" }.
|
10
|
+
map { |value| { :uri=>doc[:uri], :content=>doc[:content], :value=>value} }
|
11
|
+
end.flatten
|
12
|
+
end
|
7
13
|
end
|
8
|
-
end
|
14
|
+
end
|
@@ -1,10 +1,14 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
[]
|
1
|
+
module Sc
|
2
|
+
class UriSelector
|
3
|
+
include RDF::NodeProxy
|
4
|
+
|
5
|
+
def filter doc
|
6
|
+
# Check if the UriSelector has this URI as value (without params: ?param1=value1&param2=value2)
|
7
|
+
if rdf::value.include?(doc[:uri].match(/\A([^\?]*)(\?.*\Z)?/).captures.first)
|
8
|
+
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
|
9
|
+
else
|
10
|
+
[]
|
11
|
+
end
|
8
12
|
end
|
9
13
|
end
|
10
14
|
end
|
@@ -1,10 +1,14 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
[]
|
1
|
+
module Sc
|
2
|
+
class UriPatternSelector
|
3
|
+
include RDF::NodeProxy
|
4
|
+
|
5
|
+
def filter doc
|
6
|
+
# Check if the uri fits the pattern
|
7
|
+
if rdf::value.any? { |v| doc[:uri] =~ /\A#{v.gsub('.','\.').gsub('*', '.+')}\Z/ }
|
8
|
+
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
|
9
|
+
else
|
10
|
+
[]
|
11
|
+
end
|
8
12
|
end
|
9
13
|
end
|
10
|
-
end
|
14
|
+
end
|
@@ -1,23 +1,26 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
end
|
11
|
-
patterns = selector.sc::keyword
|
12
|
-
(doc[:content].search(pattern)[interval] || []).select { |node| patterns.any? ? patterns.include?(node.text.downcase.strip) : true }.map do |result|
|
13
|
-
if selector.sc::attribute.first
|
14
|
-
# Select node's attribute if given
|
15
|
-
selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
|
1
|
+
module Sc
|
2
|
+
class XPathSelector
|
3
|
+
include RDF::NodeProxy
|
4
|
+
include Scrappy::Formats
|
5
|
+
|
6
|
+
def filter doc
|
7
|
+
rdf::value.map do |pattern|
|
8
|
+
interval = if sc::index.first
|
9
|
+
(sc::index.first.to_i..sc::index.first.to_i)
|
16
10
|
else
|
17
|
-
|
18
|
-
[ { :uri=>doc[:uri], :content=>result, :value=>format(result, selector.sc::format, doc[:uri]) } ]
|
11
|
+
(0..-1)
|
19
12
|
end
|
20
|
-
|
21
|
-
|
13
|
+
patterns = sc::keyword
|
14
|
+
(doc[:content].search(pattern)[interval] || []).select { |node| patterns.any? ? patterns.include?(node.text.downcase.strip) : true }.map do |result|
|
15
|
+
if sc::attribute.first
|
16
|
+
# Select node's attribute if given
|
17
|
+
sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
|
18
|
+
else
|
19
|
+
# Select node
|
20
|
+
[ { :uri=>doc[:uri], :content=>result, :value=>format(result, sc::format, doc[:uri]) } ]
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end.flatten
|
24
|
+
end
|
22
25
|
end
|
23
26
|
end
|
@@ -1,24 +1,32 @@
|
|
1
1
|
body {
|
2
2
|
font-family: Arial, sans;
|
3
3
|
}
|
4
|
-
|
4
|
+
#center {
|
5
5
|
text-align: center;
|
6
6
|
margin-top: 100px;
|
7
|
+
margin-bottom: 100px;
|
7
8
|
}
|
8
|
-
|
9
|
+
#search {
|
9
10
|
margin-top: 40px;
|
10
|
-
margin-bottom: 100px;
|
11
11
|
font-size:20px;
|
12
|
+
margin-bottom: 10px;
|
12
13
|
}
|
13
|
-
|
14
|
-
width:
|
14
|
+
#search input {
|
15
|
+
width: 700px; height:30px; font-size:16px;
|
15
16
|
}
|
16
|
-
|
17
|
-
width:
|
17
|
+
#buttons {
|
18
|
+
width: 400px;
|
19
|
+
margin: auto;
|
18
20
|
}
|
19
|
-
|
20
|
-
width:
|
21
|
+
#buttons select {
|
22
|
+
width: 100px; height: 30px; font-size:16px;
|
23
|
+
margin-left: 5px;
|
21
24
|
}
|
25
|
+
#buttons button {
|
26
|
+
width: 100px; height: 30px; font-size:16px;
|
27
|
+
margin-right: 5px;
|
28
|
+
}
|
29
|
+
|
22
30
|
pre {
|
23
31
|
width: 600px;
|
24
32
|
margin-left: auto;
|
@@ -37,4 +45,7 @@ pre {
|
|
37
45
|
}
|
38
46
|
#footer {
|
39
47
|
margin-top:30px; text-align: center; font-size:14px; color: #555;
|
48
|
+
}
|
49
|
+
img {
|
50
|
+
border: none;
|
40
51
|
}
|
@@ -2,10 +2,10 @@
|
|
2
2
|
%html
|
3
3
|
%head
|
4
4
|
%title Help - Scrappy
|
5
|
-
%link{:type=>"text/css", :href=>"/stylesheets/application.css", :rel=>"stylesheet"}
|
5
|
+
%link{:type=>"text/css", :href=>"#{settings.base_uri}/stylesheets/application.css", :rel=>"stylesheet"}
|
6
6
|
%body
|
7
7
|
%div#header
|
8
|
-
%img{:src=>
|
8
|
+
%img{:src=>"#{settings.base_uri}/images/logo_small.png"}
|
9
9
|
%div#body
|
10
10
|
%h1 Help
|
11
11
|
%p
|
@@ -15,11 +15,11 @@
|
|
15
15
|
%pre http://[host]/[format]/[url]
|
16
16
|
%p
|
17
17
|
For example, to retrieve http://example.com/~user/?test=1 with RDF serialization:
|
18
|
-
%pre http://localhost
|
18
|
+
%pre==#{settings.base_uri || "http://localhost:#{settings.port}"}/rdf/http://example.com/~user/%3Ftest%3D1
|
19
19
|
%p Available serialization formats are rdf, rdfxml, png, yarf, ntriples, turtle, json, jsonrdf, and ejson.
|
20
20
|
%div#footer
|
21
|
-
%a{:href=>"/"} Home
|
21
|
+
%a{:href=>"#{settings.base_uri}/"} Home
|
22
22
|
|
|
23
|
-
%a{:href=>"/help"} Help
|
23
|
+
%a{:href=>"#{settings.base_uri}/help"} Help
|
24
24
|
|
|
25
25
|
%a{:href=>'http://github.com/josei/scrappy'} About
|
@@ -2,24 +2,25 @@
|
|
2
2
|
%html
|
3
3
|
%head
|
4
4
|
%title Scrappy
|
5
|
-
%link{:type=>"text/css", :href=>"/stylesheets/application.css", :rel=>"stylesheet"}
|
5
|
+
%link{:type=>"text/css", :href=>"#{settings.base_uri}/stylesheets/application.css", :rel=>"stylesheet"}
|
6
6
|
%body
|
7
|
-
%div
|
7
|
+
%div#center
|
8
8
|
%a{:href=>'http://github.com/josei/scrappy'}
|
9
|
-
%img{:src=>
|
10
|
-
%form
|
11
|
-
%div
|
9
|
+
%img{:src=>"#{settings.base_uri}/images/logo.png"}
|
10
|
+
%form
|
11
|
+
%div#search
|
12
|
+
%input{:name=>:uri, :size=>30, :type=>:text, :value=>'Enter URI...', :onclick=>"if(this.value=='Enter URI...') this.value='';"}
|
13
|
+
%div#buttons
|
14
|
+
%button Scrape
|
12
15
|
%select{:name=>:format}
|
13
16
|
%option{:value=>:rdf} RDF
|
14
17
|
%option{:value=>:png} PNG
|
15
18
|
%option{:value=>:ejson} JSON
|
16
19
|
%option{:value=>:yarf} YARF
|
17
20
|
%option{:value=>:ntriples} nTriples
|
18
|
-
%input{:name=>:uri, :size=>30, :type=>:text, :value=>'Enter URI...', :onclick=>"if(this.value=='Enter URI...') this.value='';"}
|
19
|
-
%button Scrape
|
20
21
|
%div#footer
|
21
|
-
%a{:href=>"/"} Home
|
22
|
+
%a{:href=>"#{settings.base_uri}/"} Home
|
22
23
|
|
|
23
|
-
%a{:href=>"/help"} Help
|
24
|
+
%a{:href=>"#{settings.base_uri}/help"} Help
|
24
25
|
|
|
25
26
|
%a{:href=>'http://github.com/josei/scrappy'} About
|
data/lib/scrappy/support.rb
CHANGED
@@ -22,4 +22,11 @@ class String
|
|
22
22
|
def wikify
|
23
23
|
gsub(/^[a-z]|\s+[a-z]/) { |a| a.upcase }.gsub(/\s/, '')
|
24
24
|
end
|
25
|
+
def underscore
|
26
|
+
self.gsub(/::/, '/').
|
27
|
+
gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
|
28
|
+
gsub(/([a-z\d])([A-Z])/,'\1_\2').
|
29
|
+
tr("-", "_").
|
30
|
+
downcase
|
31
|
+
end
|
25
32
|
end
|
data/lib/scrappy.rb
CHANGED
data/scrappy.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.1.
|
5
|
+
s.version = "0.1.24"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-03-
|
9
|
+
s.date = %q{2011-03-08}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
@@ -32,7 +32,7 @@ Gem::Specification.new do |s|
|
|
32
32
|
s.add_runtime_dependency(%q<thin>, [">= 1.2.7"])
|
33
33
|
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
|
34
34
|
s.add_runtime_dependency(%q<mechanize>, [">= 1.0.0"])
|
35
|
-
s.add_runtime_dependency(%q<lightrdf>, [">= 0.
|
35
|
+
s.add_runtime_dependency(%q<lightrdf>, [">= 0.2.0"])
|
36
36
|
s.add_runtime_dependency(%q<i18n>, [">= 0.4.2"])
|
37
37
|
s.add_runtime_dependency(%q<haml>, [">= 3.0.24"])
|
38
38
|
else
|
@@ -41,7 +41,7 @@ Gem::Specification.new do |s|
|
|
41
41
|
s.add_dependency(%q<thin>, [">= 1.2.7"])
|
42
42
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
43
43
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
44
|
-
s.add_dependency(%q<lightrdf>, [">= 0.
|
44
|
+
s.add_dependency(%q<lightrdf>, [">= 0.2.0"])
|
45
45
|
s.add_dependency(%q<i18n>, [">= 0.4.2"])
|
46
46
|
s.add_dependency(%q<haml>, [">= 3.0.24"])
|
47
47
|
end
|
@@ -51,7 +51,7 @@ Gem::Specification.new do |s|
|
|
51
51
|
s.add_dependency(%q<thin>, [">= 1.2.7"])
|
52
52
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
53
53
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
54
|
-
s.add_dependency(%q<lightrdf>, [">= 0.
|
54
|
+
s.add_dependency(%q<lightrdf>, [">= 0.2.0"])
|
55
55
|
s.add_dependency(%q<i18n>, [">= 0.4.2"])
|
56
56
|
s.add_dependency(%q<haml>, [">= 3.0.24"])
|
57
57
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
-
|
9
|
-
version: 0.1.
|
8
|
+
- 24
|
9
|
+
version: 0.1.24
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-03-
|
17
|
+
date: 2011-03-08 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -96,9 +96,9 @@ dependencies:
|
|
96
96
|
- !ruby/object:Gem::Version
|
97
97
|
segments:
|
98
98
|
- 0
|
99
|
-
-
|
100
|
-
-
|
101
|
-
version: 0.
|
99
|
+
- 2
|
100
|
+
- 0
|
101
|
+
version: 0.2.0
|
102
102
|
type: :runtime
|
103
103
|
version_requirements: *id006
|
104
104
|
- !ruby/object:Gem::Dependency
|