rubyretriever 1.0.3 → 1.1.0
- checksums.yaml +4 -4
- data/bin/rr +41 -41
- data/lib/retriever/cli.rb +20 -26
- data/lib/retriever/fetch.rb +209 -186
- data/lib/retriever/fetchfiles.rb +65 -60
- data/lib/retriever/fetchseo.rb +20 -18
- data/lib/retriever/fetchsitemap.rb +37 -32
- data/lib/retriever/link.rb +6 -2
- data/lib/retriever/openuri-redirect-patch.rb +3 -2
- data/lib/retriever/page.rb +20 -21
- data/lib/retriever/target.rb +22 -30
- data/lib/retriever/version.rb +1 -1
- data/lib/retriever.rb +2 -2
- data/readme.md +5 -4
- data/spec/page_spec.rb +6 -7
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 02c2b2530b3b83eb12325443c3c9214d977d8a56
+  data.tar.gz: f1a8b163ae3c3caed750eacb7724c2c38693ccd2
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 68c56e76fff7cee17b2e48251413df199cbe46df84bb8b51f0333510d9507627f59714bd7c4fb4a502796cefa76d362b9dbe912ea1a4d419b356f0de403e606a
+  data.tar.gz: 2f149643ba65999c783bf17bbcbffc92648eec3979f610df2fdd101c55a5a58607c2087f47911d65457d41567f5f574b79ad0a25cfac82e61c7255c1170e3e64
data/bin/rr
CHANGED
@@ -4,73 +4,73 @@ require 'retriever'
 require 'optparse'
 
 options = {}
-optparse = OptionParser.new do|opts|
+optparse = OptionParser.new do |opts|
   # Set a banner, displayed at the top
   # of the help screen.
-  opts.banner =
-  opts.on(
-  options[
+  opts.banner = 'Usage: rr [MODE FLAG] [options] Target_URL'
+  options['sitemap'] = false
+  opts.on('-s', '--sitemap [FORMAT]', 'MODE FLAG: Sitemap mode') do |output_type|
+    options['sitemap'] = output_type || ''
   end
-  opts.on(
-  options[
+  options['fileharvest'] = false
+  opts.on('-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode') do |file_ext|
+    options['fileharvest'] = file_ext
   end
-  options[
-  opts.on(
-  options[
+  options['seo'] = false
+  opts.on('-e', '--seo', 'MODE FLAG: SEO mode') do
+    options['seo'] = true
   end
-  opts.on(
-  options[
+  options['filename'] = nil
+  opts.on('-o', '--out FILENAME', 'Dump output to selected filename') do |filename|
+    options['filename'] = filename
   end
   # Define the options, and what they do
-  options[
-  opts.on(
-  options[
+  options['verbose'] = false
+  opts.on('-v', '--verbose', 'Output more information') do
+    options['verbose'] = true
   end
-  options[
-  opts.on(
-  options[
+  options['progress'] = false
+  opts.on('-p', '--progress', 'Output progress bar') do
+    options['progress'] = true
  end
-  opts.on(
+  options['maxpages'] = false
+  opts.on('-l', '--limit PAGE_LIMIT_#', 'set a max on the total number of crawled pages') do |maxpages|
    options[:maxpages] = maxpages
  end
-  options[
-  opts.on(
+  options['autodown'] = false
+  opts.on('-a', '--auto', 'Automatically download all files of filetype located') do
    options[:autodown] = true
  end
  # This displays the help screen, all programs are
  # assumed to have this option.
-  opts.on(
+  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
 end
-
+
+optparse.parse!
 if ARGV[0].nil?
-
+  abort('###Missing Required Argument\nUsage: rr [mode] [options] Target_URL')
 end
 
 ARGV.each do|q|
   if options[:verbose]
-    puts
-    puts
-    puts
-    puts "### Outputting in format: #{options[
-    puts
-    puts "### Searching for file extension: #{options[
-    puts
-    puts "### Writing output to filename: #{options[
-    puts
-    puts "### Stopping after #{options[
+    puts '###############################'
+    puts '### [RubyRetriever]'
+    puts '### Creating Sitemap' if options['sitemap']
+    puts "### Outputting in format: #{options['sitemap']}" if options['sitemap']
+    puts '### Performing File Harvest' if options['fileharvest']
+    puts "### Searching for file extension: #{options['fileharvest']} pages" if options['fileharvest']
+    puts '### Performing SEO Scrape' if options['seo']
+    puts "### Writing output to filename: #{options['filename']}" if options['filename']
+    puts '### Being verbose'
+    puts "### Stopping after #{options['maxpages']} pages"
   end
-  puts
+  puts '###############################'
   puts "### [RubyRetriever] go fetch #{q}"
   Retriever::CLI.new(q, options)
-  puts
-  puts
+  puts '### [RubyRetriever] is done.'
+  puts '###############################'
   puts
 end
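For context, the mode flags and options defined above combine into invocations like the following (the target URLs, output filename, and page limit are illustrative, not defaults):

    # sitemap mode with XML output, progress bar, capped at 100 pages
    rr --sitemap xml --progress --limit 100 http://www.example.com
    # file-harvest mode: locate PDFs and download them automatically
    rr --files pdf --auto --verbose http://www.example.com
    # SEO mode, writing results to seo-report.csv
    rr --seo --out seo-report http://www.example.com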
data/lib/retriever/cli.rb
CHANGED
@@ -1,27 +1,21 @@
 module Retriever
-      #sitemap only
-      @fetch.gen_xml if /XML/i =~ options[:sitemap].to_s
-    end
-  end
-end
+  #
+  class CLI
+    def initialize(url, options)
+      # kick off the fetch mode of choice
+      if options['fileharvest']
+        @fetch = Retriever::FetchFiles.new(url, options)
+      elsif options['sitemap']
+        @fetch = Retriever::FetchSitemap.new(url, options)
+      elsif options['seo']
+        @fetch = Retriever::FetchSEO.new(url, options)
+      else
+        fail '### Error: No Mode Selected'
+      end
+      @fetch.dump
+      @fetch.write if options['filename']
+      @fetch.autodownload if options['autodown'] && options['fileharvest']
+      @fetch.gen_xml if /XML/i =~ options['sitemap'].to_s
+    end
+  end
+end
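As a rough sketch (not something documented by the gem), the same entry point can be driven from Ruby; the new CLI keys its options by string, and the crawl runs inside the constructor:

    require 'retriever'
    # roughly equivalent to `rr --sitemap xml www.example.com`; the host is a placeholder
    Retriever::CLI.new('www.example.com', 'sitemap' => 'xml')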
data/lib/retriever/fetch.rb
CHANGED
@@ -7,191 +7,214 @@ require 'csv'
 require 'bloomfilter-rb'
 
 module Retriever
+  #
+  class Fetch
+    attr_reader :max_pages, :t
+    # given target URL and RR options, creates a fetch object.
+    # There is no direct output
+    # this is a parent class that the other fetch classes build off of.
+    def initialize(url, options)
+      @connection_tally = {
+        :success => 0,
+        :error => 0,
+        :error_client => 0,
+        :error_server => 0
+      }
+      # OPTIONS
+      @prgrss = options['progress']
+      @max_pages = options['maxpages'] ? options['maxpages'].to_i : 100
+      @v = options['verbose']
+      @output = options['filename']
+      @fh = options['fileharvest']
+      @file_ext = @fh.to_s
+      @s = options['sitemap']
+      @seo = options['seo']
+      @autodown = options['autodown']
+      #
+      if @fh
+        temp_ext_str = '.' + @file_ext + '\z'
+        @file_re = Regexp.new(temp_ext_str).freeze
+      else
+        # when FH is not true, and autodown is true
+        errlog('Cannot AUTODOWNLOAD when not in FILEHARVEST MODE') if @autodown
+      end
+      if @prgrss
+        # verbose & progressbar conflict
+        errlog('CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p') if @v
+        prgress_vars = {
+          :title => 'Pages',
+          :starting_at => 1,
+          :total => @max_pages,
+          :format => '%a |%b>%i| %c/%C %t'
+        }
+        @progressbar = ProgressBar.create(prgress_vars)
+      end
+      @t = Retriever::Target.new(url, @file_re)
+      @output = "rr-#{@t.host.split('.')[1]}" if @fh && !@output
+      @already_crawled = BloomFilter::Native.new(
+        :size => 1_000_000,
+        :hashes => 5,
+        :seed => 1,
+        :bucket => 8,
+        :raise => false
+      )
+      @already_crawled.insert(@t.target)
+    end
 
-        :error => 0,
-        :error_client => 0,
-        :error_server => 0
-      }
-      #OPTIONS
-      @prgrss = options[:progress] ? options[:progress] : false
-      @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
-      @v= options[:verbose] ? true : false
-      @output=options[:filename] ? options[:filename] : false
-      @fh = options[:fileharvest] ? options[:fileharvest] : false
-      @file_ext = @fh.to_s
-      @s = options[:sitemap] ? options[:sitemap] : false
-      @seo = options[:seo] ? true : false
-      @autodown = options[:autodown] ? true : false
-      #
-      if @fh
-        tempExtStr = "."+@file_ext+'\z'
-        @file_re = Regexp.new(tempExtStr).freeze
-      else
-        errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
-      end
-      if @prgrss
-        errlog("CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p") if @v #verbose & progressbar conflict
-        prgressVars = {
-          :title => "Pages Crawled",
-          :starting_at => 1,
-          :total => @maxPages,
-          :format => '%a |%b>%i| %c/%C %t',
-        }
-        @progressbar = ProgressBar.create(prgressVars)
-      end
-      @t = Retriever::Target.new(url,@file_re)
-      @already_crawled = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
-      @already_crawled.insert(@t.target)
-      if (@fh && !@output)
-        @output = "rr-#{@t.host.split('.')[1]}"
-      end
-      fail "bad page source on target -- try HTTPS?" if !@t.source
-    end
-    def errlog(msg)
-      raise "ERROR: #{msg}"
-    end
-    def lg(msg)
-      puts "### #{msg}" if @v
-    end
-    def dump #prints current data collection to STDOUT, meant for CLI use.
-      puts "###############################"
-      if @v
-        puts "Connection Tally:"
-        puts @connection_tally.to_s
-        puts "###############################"
-      end
-      if @s
-        puts "#{@t.target} Sitemap"
-        puts "Page Count: #{@data.size}"
-      elsif @fh
-        puts "Target URL: #{@t.target}"
-        puts "Filetype: #{@file_ext}"
-        puts "File Count: #{@data.size}"
-      elsif @seo
-        puts "#{@t.target} SEO Metrics"
-        puts "Page Count: #{@data.size}"
-      else
-        fail "ERROR - Cannot dump - Mode Not Found"
-      end
-      puts "###############################"
-      @data.each do |line|
-        puts line
-      end
-      puts "###############################"
-      puts
-    end
-    def write #writes current data collection out to CSV in current directory
-      if @output
-        i = 0
-        CSV.open("#{@output}.csv", "w") do |csv|
-          if ((i == 0) && @seo)
-            csv << ['URL','Page Title','Meta Description','H1','H2']
-            i +=1
-          end
-          @data.each do |entry|
-            csv << entry
-          end
-        end
-        puts "###############################"
-        puts "File Created: #{@output}.csv"
-        puts "Object Count: #{@data.size}"
-        puts "###############################"
-        puts
-      end
-    end
-    def async_crawl_and_collect() #iterates over the excisting @linkStack, running asyncGetWave on it until we reach the @maxPages value.
-      while (@already_crawled.size < @maxPages)
-        if @linkStack.empty?
-          if @prgrss
-            @progressbar.log("Can't find any more links. Site might be completely mapped.")
-          else
-            lg("Can't find any more links. Site might be completely mapped.")
-          end
-          break;
-        end
-        new_links_arr = self.asyncGetWave()
-        next if (new_links_arr.nil? || new_links_arr.empty?)
-        new_link_arr = new_links_arr-@linkStack #set operations to see are these in our previous visited pages arr?
-        @linkStack.concat(new_links_arr).uniq!
-        @data.concat(new_links_arr) if @s
-      end
-      @progressbar.finish if @prgrss #if we are done, let's make sure progress bar says we are done
-    end
-    def good_response?(resp, url) #returns true is resp is ok to continue process, false is we need to 'next' it
-      return false if !resp
-      if resp.response_header.redirection? #we got redirected
-        loc = resp.response_header.location
-        lg("#{url} Redirected to #{loc}")
-        if t.host_re =~ loc #if being redirected to same host, let's add to linkstack
-          @linkStack.push(loc) if !@already_crawled.include?(loc) #but only if we haven't already crawled it
-          lg("--Added to linkStack for later")
-          return false
-        end
-        lg("Redirection outside of target host. No - go. #{loc}")
-        return false
-      end
-      if (!resp.response_header.successful?) #if webpage is not text/html, let's not continue and lets also make sure we dont re-queue it
-        lg("UNSUCCESSFUL CONNECTION -- #{url}")
-        @connection_tally[:error] += 1
-        @connection_tally[:error_server] += 1 if resp.response_header.server_error?
-        @connection_tally[:error_client] += 1 if resp.response_header.client_error?
-        return false
-      end
-      if (!(resp.response_header['CONTENT_TYPE'].include?("text/html"))) #if webpage is not text/html, let's not continue and lets also make sure we dont re-queue it
-        @already_crawled.insert(url)
-        @linkStack.delete(url)
-        lg("Page Not text/html -- #{url}")
-        return false
-      end
-      @connection_tally[:success] += 1
-      return true
-    end
+    def errlog(msg)
+      fail "ERROR: #{msg}"
+    end
 
+    def lg(msg)
+      puts "### #{msg}" if @v
+    end
+
+    # prints current data collection to STDOUT
+    def dump
+      puts '###############################'
+      if @v
+        puts 'Connection Tally:'
+        puts @connection_tally.to_s
+        puts '###############################'
+      end
+      if @s
+        puts "#{@t.target} Sitemap"
+        puts "Page Count: #{@data.size}"
+      elsif @fh
+        puts "Target URL: #{@t.target}"
+        puts "Filetype: #{@file_ext}"
+        puts "File Count: #{@data.size}"
+      elsif @seo
+        puts "#{@t.target} SEO Metrics"
+        puts "Page Count: #{@data.size}"
+      else
+        fail 'ERROR - Cannot dump - Mode Not Found'
+      end
+      puts '###############################'
+      @data.each do |line|
+        puts line
+      end
+      puts '###############################'
+      puts
+    end
+
+    # writes current data collection out to CSV in current directory
+    def write
+      return false unless @output
+      i = 0
+      CSV.open("#{@output}.csv", 'w') do |csv|
+        if (i == 0) && @seo
+          csv << ['URL', 'Page Title', 'Meta Description', 'H1', 'H2']
+          i += 1
+        end
+        @data.each do |entry|
+          csv << entry
+        end
+      end
+      puts '###############################'
+      puts "File Created: #{@output}.csv"
+      puts "Object Count: #{@data.size}"
+      puts '###############################'
+      puts
+    end
+
+    # iterates over the existing @link_stack
+    # running until we reach the @max_pages value.
+    def async_crawl_and_collect
+      while @already_crawled.size < @max_pages
+        if @link_stack.empty?
+          if @prgrss
+            @progressbar.log("Can't find any more links.")
+          else
+            lg("Can't find any more links.")
+          end
+          break
+        end
+        new_links_arr = process_link_stack
+        next if new_links_arr.nil? || new_links_arr.empty?
+        # set operations to see are these in our previous visited pages arr
+        new_links_arr -= @link_stack
+        @link_stack.concat(new_links_arr).uniq!
+        @data.concat(new_links_arr) if @s
+      end
+      # done, make sure progress bar says we are done
+      @progressbar.finish if @prgrss
+    end
+
+    # returns true is resp is ok to continue
+    def good_response?(resp, url)
+      return false unless resp
+      hdr = resp.response_header
+      if hdr.redirection?
+        loc = hdr.location
+        lg("#{url} Redirected to #{loc}")
+        if t.host_re =~ loc
+          @link_stack.push(loc) unless @already_crawled.include?(loc)
+          lg('--Added to linkStack for later')
+          return false
+        end
+        lg("Redirection outside of target host. No - go. #{loc}")
+        return false
+      end
+      # lets not continue if unsuccessful connection
+      unless hdr.successful?
+        lg("UNSUCCESSFUL CONNECTION -- #{url}")
+        @connection_tally[:error] += 1
+        @connection_tally[:error_server] += 1 if hdr.server_error?
+        @connection_tally[:error_client] += 1 if hdr.client_error?
+        return false
+      end
+      # let's not continue if not text/html
+      unless hdr['CONTENT_TYPE'].include?('text/html')
+        @already_crawled.insert(url)
+        @link_stack.delete(url)
+        lg("Page Not text/html -- #{url}")
+        return false
+      end
+      @connection_tally[:success] += 1
+      true
+    end
+
+    # send a new wave of GET requests, using current @link_stack
+    def process_link_stack
+      new_stuff = []
+      EM.synchrony do
+        concurrency = 10
+        EM::Synchrony::FiberIterator.new(@link_stack, concurrency).each do |url|
+          next if @already_crawled.size >= @max_pages
+          next if @already_crawled.include?(url)
+          resp = EventMachine::HttpRequest.new(url).get
+          next unless good_response?(resp, url)
+          lg("Page Fetched: #{url}")
+          @already_crawled.insert(url)
+          new_page = Retriever::Page.new(resp.response, @t)
+          if @prgrss
+            @progressbar.increment if @already_crawled.size < @max_pages
+          end
+          if @seo
+            seos = [url]
+            seos.concat(new_page.parse_seo)
+            @data.push(seos)
+            lg('--page SEO scraped')
+          end
+          next if new_page.links.size == 0
+          lg("--#{new_page.links.size} links found")
+          internal_links_arr = new_page.parse_internal_visitable
+          new_stuff.push(internal_links_arr)
+          if @fh
+            filez = new_page.parse_files
+            @data.concat(filez) unless filez.empty?
+            lg("--#{filez.size} files found")
+          end
+        end
+        new_stuff = new_stuff.flatten # all completed requests
+        EventMachine.stop
+      end
+      new_stuff.uniq!
+    end
+  end
+end
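Two ideas carry this class: a Bloom filter (bloomfilter-rb) remembers which URLs have already been crawled without storing them all, and EM-Synchrony's FiberIterator fires each wave of GET requests concurrently while the code reads sequentially. A stripped-down sketch of that pattern, outside the gem, assuming the em-http-request, em-synchrony and bloomfilter-rb gems and placeholder URLs:

    require 'em-synchrony'
    require 'em-synchrony/em-http'
    require 'em-synchrony/fiber_iterator'
    require 'bloomfilter-rb'

    # same Bloom filter settings the Fetch class uses above
    seen = BloomFilter::Native.new(:size => 1_000_000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
    urls = ['http://www.example.com/a', 'http://www.example.com/b']

    EM.synchrony do
      # each fiber issues a blocking-looking GET; up to 10 run at once
      EM::Synchrony::FiberIterator.new(urls, 10).each do |url|
        next if seen.include?(url)
        resp = EventMachine::HttpRequest.new(url).get
        seen.insert(url)
        puts "#{url} -> #{resp.response_header.status}"
      end
      EventMachine.stop
    end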
data/lib/retriever/fetchfiles.rb
CHANGED
@@ -1,65 +1,70 @@
 module Retriever
+  # recieves target url and RR options
+  # returns an array of all unique files (based on given filetype)
+  # found on the target site
+  class FetchFiles < Fetch
+    def initialize(url, options)
+      super
+      @data = []
+      page_one = Retriever::Page.new(@t.source, @t)
+      @link_stack = page_one.parse_internal_visitable
+      lg("URL Crawled: #{@t.target}")
+      lg("#{@link_stack.size - 1} new links found")
 
+      temp_file_collection = page_one.parse_files
+      @data.concat(tempFileCollection) if temp_file_collection.size > 0
+      lg("#{@data.size} new files found")
+      errlog("Bad URL -- #{@t.target}") unless @link_stack
+      @link_stack.delete(@t.target)
 
-      @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+      async_crawl_and_collect
 
+      @data.sort_by! { |x| x.length }
+      @data.uniq!
+    end
 
-    end
+    def download_file(path)
+      # given valid url, downloads file to current directory in /rr-downloads/
+      arr = path.split('/')
+      shortname = arr.pop
+      puts "Initiating Download to: '/rr-downloads/' + #{shortname}"
+      File.open(shortname, 'wb') do |saved_file|
+        open(path) do |read_file|
+          saved_file.write(read_file.read)
+        end
+      end
+      puts ' SUCCESS: Download Complete'
+    end
+
+    def autodownload
+      # go through the fetched file URL collection and download each one.
+      lenny = @data.count
+      puts '###################'
+      puts '### Initiating Autodownload...'
+      puts '###################'
+      puts "#{lenny} - #{@file_ext}'s Located"
+      puts '###################'
+      if File.directory?('rr-downloads')
+        Dir.chdir('rr-downloads')
+      else
+        puts 'creating rr-downloads Directory'
+        Dir.mkdir('rr-downloads')
+        Dir.chdir('rr-downloads')
+      end
+      file_counter = 0
+      @data.each do |entry|
+        begin
+          download_file(entry)
+          file_counter += 1
+          lg(' File [#{file_counter} of #{lenny}]')
+          puts
+        rescue StandardError => e
+          puts 'ERROR: failed to download - #{entry}'
+          puts e.message
+          puts
+        end
+      end
+      Dir.chdir('..')
+    end
+  end
+end
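A minimal sketch of driving this class directly, mirroring what the CLI does in file-harvest mode (the host and the 'pdf' extension are placeholders; the crawl and the directory changes happen for real):

    fetch = Retriever::FetchFiles.new('www.example.com', 'fileharvest' => 'pdf')
    fetch.dump          # print the collected file URLs
    fetch.autodownload  # pull each one into ./rr-downloads/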
data/lib/retriever/fetchseo.rb
CHANGED
@@ -1,23 +1,25 @@
 module Retriever
+  #
+  class FetchSEO < Fetch
+    # recieves target url and RR options
+    # returns an array of onpage SEO related fields
+    # on all unique pages found on the site
+    def initialize(url, options)
+      super
+      @data = []
+      page_one = Retriever::Page.new(@t.source, @t)
+      lg("URL Crawled: #{@t.target}")
 
+      @link_stack = page_one.parse_internal_visitable
+      errlog("Bad URL -- #{@t.target}") unless @link_stack
+      lg("#{@link_stack.size - 1} links found")
+      @link_stack.delete(@t.target)
 
-      @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+      @data.push(page_one.parse_seo)
 
+      async_crawl_and_collect
 
-    end
+      @data.sort_by! { |x| x[0].length }
+    end
+  end
+end
data/lib/retriever/fetchsitemap.rb
CHANGED
@@ -1,36 +1,41 @@
 module Retriever
+  #
+  class FetchSitemap < Fetch
+    # recieves target URL and RR options
+    # returns an array of all unique pages found on the site
+    def initialize(url, options)
+      super
+      @data = [@t.target]
+      page_one = Retriever::Page.new(@t.source, @t)
+      lg("URL Crawled: #{@t.target}")
+      @link_stack = page_one.parse_internal_visitable
+      errlog("Bad URL -- #{@t.target}") unless @link_stack
+      lg("#{@link_stack.size - 1} links found")
 
-      @data.concat(@linkStack)
+      @link_stack.delete(@t.target)
+      @data.concat(@link_stack)
 
+      async_crawl_and_collect
 
-    end
+      @data.sort_by! { |x| x.length } if @data.size > 1
+      @data.uniq!
+    end
+
+    # produces valid XML sitemap based on page collection fetched.
+    # Writes to current directory.
+    def gen_xml
+      f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
+      f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
+      @data.each do |url|
+        f << "<url><loc>#{url}</loc></url>"
+      end
+      f << '</urlset>'
+      f.close
+      puts '###############################'
+      puts "File Created: sitemap-#{@t.host.split('.')[1]}.xml"
+      puts "Object Count: #{@data.size}"
+      puts '###############################'
+      puts
+    end
+  end
+end
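A minimal sketch of driving this class directly, mirroring the CLI's sitemap mode (placeholder host); gen_xml writes a single-line urlset document with one <url><loc>...</loc></url> entry per crawled page:

    fetch = Retriever::FetchSitemap.new('www.example.com', 'sitemap' => 'XML')
    fetch.dump     # print the page collection
    fetch.gen_xml  # writes sitemap-example.xml to the current directory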
data/lib/retriever/link.rb
CHANGED
@@ -1,4 +1,5 @@
 module Retriever
+  #
   class Link
     HTTP_RE = Regexp.new(/^http/i).freeze
     SINGLE_SLASH_RE = Regexp.new(/^\/{1}[^\/]/).freeze
@@ -18,12 +19,15 @@ module Retriever
 
       return "http://#{host}#{link}" if SINGLE_SLASH_RE =~ link
 
-
+      # link begins with '//'
+      return "http:#{link}" if DOUBLE_SLASH_RE =~ link
 
-
+      # link uses relative path with no slashes at all
+      return "http://#{host}/#{link}" if NO_SLASH_PAGE_RE =~ link
     end
 
     private
+
     attr_reader :host, :link
   end
 end
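Based on the branches shown above, Link#path normalizes an href against a host roughly like this (host and hrefs are made up, and the exact regex behaviour is inferred from the constant names):

    Retriever::Link.new('www.example.com', '/about').path
    # => "http://www.example.com/about"
    Retriever::Link.new('www.example.com', '//cdn.example.com/app.js').path
    # => "http://cdn.example.com/app.js"
    Retriever::Link.new('www.example.com', 'contact.html').path
    # => "http://www.example.com/contact.html"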
data/lib/retriever/openuri-redirect-patch.rb
CHANGED
@@ -1,6 +1,7 @@
 module OpenURI
-
+  # nesc patch otherwise OPENURI blocks redirects to and from https
+  def OpenURI.redirectable?(uri1, uri2)
     uri1.scheme.downcase == uri2.scheme.downcase ||
       (/\A(?:http|https)\z/i =~ uri1.scheme && /\A(?:http|https)\z/i =~ uri2.scheme)
   end
-end
+end
data/lib/retriever/page.rb
CHANGED
@@ -1,7 +1,6 @@
 module Retriever
-
+  #
   class Page
-
     HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
     NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico|xml)/ix).freeze
     HTTP_RE = Regexp.new(/^http/i).freeze
@@ -14,55 +13,55 @@ module Retriever
 
     attr_reader :links, :source, :t
 
-    def initialize(source,t)
+    def initialize(source, t)
       @t = t
       @source = source.encode('UTF-8', :invalid => :replace, :undef => :replace)
       @links = nil
     end
 
-    #recieves page source as string
-    #returns array of unique href links
+    # recieves page source as string
+    # returns array of unique href links
     def links
       return @links if @links
-      return false
-      @links = @source.scan(HREF_CONTENTS_RE).map do |match|
+      return false unless @source
+      @links = @source.scan(HREF_CONTENTS_RE).map do |match|
+        # filter some malformed URLS that come in
+        # meant to be a loose filter to catch all reasonable HREF attributes.
         link = match[0]
         Link.new(@t.host, link).path
       end.uniq
     end
 
-    def
-      links.select{ |linky| (@t.host_re =~ linky) }
+    def parse_internal
+      links.select { |linky| (@t.host_re =~ linky) }
     end
 
-    def
-
+    def parse_internal_visitable
+      parse_internal.select { |linky| (!(NONPAGE_EXT_RE =~ linky)) }
     end
 
-    def
-      links.select{ |linky| (@t.file_re =~ linky)}
+    def parse_files
+      links.select { |linky| (@t.file_re =~ linky) }
     end
 
     def title
-      TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] :
+      TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ''
     end
 
     def desc
-      DESC_RE =~ @source ? @source.match(DESC_RE)[1] :
+      DESC_RE =~ @source ? @source.match(DESC_RE)[1] : ''
     end
 
     def h1
-      H1_RE =~ @source ? @source.match(H1_RE)[1] :
+      H1_RE =~ @source ? @source.match(H1_RE)[1] : ''
    end
 
    def h2
-      H2_RE =~ @source ? @source.match(H2_RE)[1] :
+      H2_RE =~ @source ? @source.match(H2_RE)[1] : ''
    end
 
-    def
-
+    def parse_seo
+      [title, desc, h1, h2]
    end
  end
 end
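A small sketch of how the Page methods fit together (the HTML string and host are made up):

    html = "<html><head><title>Widgets</title></head>" \
           "<body><h1>All widgets</h1> <a href='http://www.example.com/a.pdf'>a</a></body></html>"
    t    = Retriever::Target.new('www.example.com', /\.pdf\z/)
    page = Retriever::Page.new(html, t)
    page.links                     # every href found in the source
    page.parse_internal_visitable  # same-host links that are not css/js/images/etc.
    page.parse_files               # links matching the target's file regex
    page.parse_seo                 # => [title, meta description, h1, h2]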
data/lib/retriever/target.rb
CHANGED
@@ -1,52 +1,44 @@
 require 'open-uri'
 
 module Retriever
-
+  #
   class Target
-
     HTTP_RE = Regexp.new(/^http/i).freeze
     DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
 
     attr_reader :host, :target, :host_re, :source, :file_re
 
-    def initialize(url,file_re=nil)
-      url = "http://#{url}"
-      fail
+    def initialize(url, file_re = nil)
+      url = "http://#{url}" unless HTTP_RE =~ url
+      fail 'Bad URL' unless /\./ =~ url
       new_uri = URI(url)
       @target = new_uri.to_s
       @host = new_uri.host
-      @host_re = Regexp.new(@host.sub('www.',''))
+      @host_re = Regexp.new(@host.sub('www.', ''))
       @file_re ||= file_re
     end
 
     def source
-      resp =
-      begin
-        resp = open(@target)
-      rescue StandardError => e
-        trap("ABRT"){
-          puts "#{@target} failed SSL Certification Verification"
-        }
-        return false
-      end
+      resp = open(@target)
       resp_url = resp.base_uri.to_s
-      if
-        @host = new_t.host
-        return new_t.source
-      end
-      fail "Domain redirecting to new host: #{resp.base_uri.to_s}" #if it's not same host, we want to fail
+      if @target != resp_url
+        fail "Domain redirecting: #{resp_url}" unless @host_re =~ resp_url
+        # if redirect URL is same host, we want to re-sync @target
+        return resync_target_and_return_source(resp_url)
       end
       resp = resp.read
-
+      #
+      fail 'Domain is not working. Try the non-WWW version.' if resp == ''
+      fail 'Domain not working. Try HTTPS???' unless resp
+      # consider using scrub from ruby 2.1? this misses some things
+      resp.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace)
     end
 
+    def resync_target_and_return_source(url)
+      new_t = Retriever::Target.new(url)
+      @target = new_t.target
+      @host = new_t.host
+      new_t.source
+    end
   end
 end
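In use, Target turns a bare host into a canonical URL and fetches the page body, re-syncing itself when the site redirects within the same host (example.com is a placeholder):

    t = Retriever::Target.new('www.example.com')
    t.target        # => "http://www.example.com"
    t.host          # => "www.example.com"
    html = t.source # GET via open-uri; fails if the domain redirects to a different host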
data/lib/retriever/version.rb
CHANGED
data/lib/retriever.rb
CHANGED
data/readme.md
CHANGED
@@ -4,13 +4,14 @@
 
 By Joe Norton
 
-RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader
+RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader.
 
-RubyRetriever uses
+RubyRetriever (RR) uses asynchronous HTTP requests, thanks to [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony), to crawl webpages *very quickly*. Another neat thing about RR, is it uses a ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of page's it has already crawled.
 
-
+**Use at Own Risk**
+RR does NOT respect robots.txt, and RR currently - by default - launches up to 10 parallel GET requests at once. This is a feature, do not abuse it.
 
-v1.0 Update 6/07/2014 - Includes major code changes, a lot of bug fixes. Much better in dealing with redirects, and issues with the host changing, etc. Also, added the SEO mode -- which grabs a number of key SEO components from every page on a site. Lastly, this
+**v1.0 Update (6/07/2014)** - Includes major code changes, a lot of bug fixes. Much better in dealing with redirects, and issues with the host changing, etc. Also, added the SEO mode -- which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility -- and thus, this was update 1.0!
 
 
 getting started
CHANGED
@@ -20,8 +20,8 @@ SOURCE
|
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
|
-
describe "#
|
24
|
-
let (:links){Retriever::Page.new(@source,t).
|
23
|
+
describe "#parse_internal" do
|
24
|
+
let (:links){Retriever::Page.new(@source,t).parse_internal}
|
25
25
|
it "filters links by host" do
|
26
26
|
@source = (<<SOURCE).strip
|
27
27
|
<a href='http://www.cnet.com/'>download</a>
|
@@ -32,8 +32,8 @@ SOURCE
|
|
32
32
|
end
|
33
33
|
end
|
34
34
|
|
35
|
-
describe "#
|
36
|
-
let (:links){Retriever::Page.new(@source,t).
|
35
|
+
describe "#parse_internal_visitable" do
|
36
|
+
let (:links){Retriever::Page.new(@source,t).parse_internal_visitable}
|
37
37
|
it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
|
38
38
|
@source = (<<SOURCE).strip
|
39
39
|
<link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
|
@@ -43,7 +43,7 @@ SOURCE
|
|
43
43
|
end
|
44
44
|
|
45
45
|
describe "#parseFiles" do
|
46
|
-
let (:links){Retriever::Page.new(@source,t).
|
46
|
+
let (:links){Retriever::Page.new(@source,t).parse_files}
|
47
47
|
it "filters links by filetype" do
|
48
48
|
@source = (<<SOURCE).strip
|
49
49
|
<a href='www.cnet.com/download.exe'>download</a>
|
@@ -90,5 +90,4 @@ SOURCE
|
|
90
90
|
expect(page.h2).to eq(' test 4 ')
|
91
91
|
end
|
92
92
|
end
|
93
|
-
|
94
|
-
end
|
93
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 1.0.3
+  version: 1.1.0
 platform: ruby
 authors:
 - Joe Norton
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-06-
+date: 2014-06-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: em-synchrony