sengi 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +7 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +71 -0
- data/Makefile +23 -0
- data/Makefile.common +58 -0
- data/README.md +59 -0
- data/Rakefile +25 -0
- data/bin/config +148 -0
- data/bin/crawler +64 -0
- data/bin/list +129 -0
- data/bin/redis_start +11 -0
- data/bin/redis_stats +13 -0
- data/bin/redis_stop +10 -0
- data/bin/resque_crawler_restart +14 -0
- data/bin/resque_crawler_start +21 -0
- data/bin/resque_crawler_stop +20 -0
- data/bin/resque_scheduler_start +15 -0
- data/bin/resque_scheduler_stop +16 -0
- data/bin/resque_server_start +13 -0
- data/bin/resque_server_stop +13 -0
- data/config/redis.conf +120 -0
- data/config/resque_server_config.rb +6 -0
- data/lib/sengi.rb +5 -0
- data/lib/sengi/crawler.rb +589 -0
- data/lib/sengi/crawler_worker.rb +16 -0
- data/lib/sengi/uri.rb +288 -0
- data/lib/sengi/version.rb +17 -0
- data/sengi.gemspec +37 -0
- data/sengi.sublime-project +10 -0
- data/tests/tc_crawler.rb +14 -0
- data/tests/tc_uri.rb +140 -0
- data/tests/ts_all.rb +4 -0
- metadata +202 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 027ebc7d9b5e1a8f6d3bb4b5c434beb27a152c24
|
4
|
+
data.tar.gz: c79a8f8dd0fdf168071b0ce93049f93e9fcb9422
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3799cf4daca51694b6f375d7ed68db4111d21c95176860845a5970e25ee8d3d056250a45f630776d4922b9d835b93f4134fa7facc2c5f5a4809a4cca568f022a
|
7
|
+
data.tar.gz: 9fabd1047750167bb3e7daaa7638fcd013a3fb04ede390b8b845d7743190a97bc81819f0af59938da8d92ef4d7bb5e501f1c5642b5361d5a60da0d55fd183b31
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
sengi (0.1.0.pre.dev.6)
|
5
|
+
activesupport (~> 4.2)
|
6
|
+
cookiejar (~> 0.3)
|
7
|
+
hiredis (~> 0.6)
|
8
|
+
nokogiri (~> 1.6)
|
9
|
+
redis (~> 3.2)
|
10
|
+
resque (~> 1.26)
|
11
|
+
resque-scheduler (~> 4.1)
|
12
|
+
thefox-ext (~> 1.4)
|
13
|
+
|
14
|
+
GEM
|
15
|
+
remote: https://rubygems.org/
|
16
|
+
specs:
|
17
|
+
activesupport (4.2.6)
|
18
|
+
i18n (~> 0.7)
|
19
|
+
json (~> 1.7, >= 1.7.7)
|
20
|
+
minitest (~> 5.1)
|
21
|
+
thread_safe (~> 0.3, >= 0.3.4)
|
22
|
+
tzinfo (~> 1.1)
|
23
|
+
cookiejar (0.3.3)
|
24
|
+
hiredis (0.6.1)
|
25
|
+
i18n (0.7.0)
|
26
|
+
json (1.8.3)
|
27
|
+
mini_portile2 (2.0.0)
|
28
|
+
minitest (5.8.4)
|
29
|
+
mono_logger (1.1.0)
|
30
|
+
multi_json (1.11.2)
|
31
|
+
nokogiri (1.6.7.2)
|
32
|
+
mini_portile2 (~> 2.0.0.rc2)
|
33
|
+
rack (1.6.4)
|
34
|
+
rack-protection (1.5.3)
|
35
|
+
rack
|
36
|
+
redis (3.3.0)
|
37
|
+
redis-namespace (1.5.2)
|
38
|
+
redis (~> 3.0, >= 3.0.4)
|
39
|
+
resque (1.26.0)
|
40
|
+
mono_logger (~> 1.0)
|
41
|
+
multi_json (~> 1.0)
|
42
|
+
redis-namespace (~> 1.3)
|
43
|
+
sinatra (>= 0.9.2)
|
44
|
+
vegas (~> 0.1.2)
|
45
|
+
resque-scheduler (4.1.0)
|
46
|
+
mono_logger (~> 1.0)
|
47
|
+
redis (~> 3.0)
|
48
|
+
resque (~> 1.25)
|
49
|
+
rufus-scheduler (~> 3.0)
|
50
|
+
rufus-scheduler (3.2.0)
|
51
|
+
sinatra (1.4.7)
|
52
|
+
rack (~> 1.5)
|
53
|
+
rack-protection (~> 1.4)
|
54
|
+
tilt (>= 1.3, < 3)
|
55
|
+
thefox-ext (1.4.1)
|
56
|
+
thread_safe (0.3.5)
|
57
|
+
tilt (2.0.2)
|
58
|
+
tzinfo (1.2.2)
|
59
|
+
thread_safe (~> 0.1)
|
60
|
+
vegas (0.1.11)
|
61
|
+
rack (>= 1.0.0)
|
62
|
+
|
63
|
+
PLATFORMS
|
64
|
+
ruby
|
65
|
+
|
66
|
+
DEPENDENCIES
|
67
|
+
minitest (~> 5.8)
|
68
|
+
sengi!
|
69
|
+
|
70
|
+
BUNDLED WITH
|
71
|
+
1.11.2
|
data/Makefile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
|
2
|
+
# Project-specific settings consumed by Makefile.common.
GEM_NAME = sengi
# Extra targets built by the default `all` goal from Makefile.common.
ALL_TARGETS_EXT = tmp run init

include Makefile.common

# Enqueue the development test site for crawling.
# Declared phony so a stray file named `dev` cannot disable the target.
.PHONY: dev
dev:
	RUBYOPT=-rbundler/setup ruby ./bin/crawler -q http://dev.fox21.at/sengi/

# Create the `run` directory (a real directory target, so not phony).
run:
	$(MKDIR) $@

# Run `bin/config --reset`.
.PHONY: reset
reset:
	RUBYOPT=-rbundler/setup ruby ./bin/config --reset

# Run `bin/config --init`.
.PHONY: init
init:
	RUBYOPT=-rbundler/setup ruby ./bin/config --init

# Run the test suite through Bundler with Ruby warnings enabled (-w).
.PHONY: test
test:
	RUBYOPT=-w $(BUNDLER) exec ./tests/ts_all.rb
|
data/Makefile.common
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
|
2
|
+
# Ruby Common Big
# 2016-04-09
#
# Shared targets for Ruby gem projects. The including Makefile must define
# GEM_NAME and may define ALL_TARGETS_EXT (extra prerequisites of `all`).

MV = mv -nv
RM = rm -rf
MKDIR = mkdir -p
CHMOD = chmod
BUNDLER = bundle
BUNDLER_OPTIONS = --jobs=5 --retry=3
GEMSPEC_FILE = $(GEM_NAME).gemspec

.PHONY: all
all: setup $(ALL_TARGETS_EXT)

.PHONY: setup
setup: .setup

# Stamp file: the bundle is installed only once, until `clean` removes .setup.
.setup:
	$(BUNDLER) install $(BUNDLER_OPTIONS)
	touch $@

# Build the gem, install it system-wide and remove the built file again.
# `set -e` plus the non-empty check abort the recipe when `gem build` did not
# report an output file, instead of running `sudo gem install` with no argument.
.PHONY: install
install:
	set -e; \
	gem_file=$$(gem build $(GEMSPEC_FILE) | grep 'File:' | tail -1 | awk '{ print $$2 }'); \
	[ -n "$$gem_file" ]; \
	sudo gem install "$$gem_file"; \
	$(RM) "$$gem_file"

.PHONY: uninstall
uninstall:
	sudo gem uninstall $(GEM_NAME)

.PHONY: update
update:
	$(BUNDLER) update

.PHONY: clean
clean:
	$(RM) .bundle
	$(RM) .setup
	$(RM) Gemfile.lock

# Build the gem, move it into releases/ and push it.
# `releases` is an order-only prerequisite: the directory must exist, but its
# timestamp never forces a rebuild. The recipe aborts (set -e) when no gem file
# was built or when the same file already exists in releases/.
.PHONY: release
release: | releases
	set -e; \
	gem_file=$$(gem build $(GEMSPEC_FILE) | grep 'File:' | tail -1 | awk '{ print $$2 }'); \
	[ -n "$$gem_file" ]; \
	dst="releases/$$gem_file"; \
	[ ! -f "$$dst" ]; \
	$(MV) "$$gem_file" releases; \
	gem push "$$dst"; \
	echo 'done'

releases:
	$(MKDIR) $@

# Private scratch directory: accessible by the owner only.
tmp:
	$(MKDIR) $@
	$(CHMOD) u=rwx,go-rwx $@
|
data/README.md
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
# Sengi Web Crawler
|
2
|
+
|
3
|
+
A web crawler using Ruby and Redis.
|
4
|
+
|
5
|
+
## Install
|
6
|
+
|
7
|
+
First, run:
|
8
|
+
|
9
|
+
gem install rake bundler nokogiri hiredis
|
10
|
+
make
|
11
|
+
|
12
|
+
## Setup
|
13
|
+
|
14
|
+
[Redis](http://redis.io/) is used to store everything, so it is always needed to run Sengi.
|
15
|
+
|
16
|
+
Start Redis:
|
17
|
+
|
18
|
+
./bin/redis_start
|
19
|
+
|
20
|
+
Start [Resque](https://github.com/resque/resque) -- Scheduler and Worker:
|
21
|
+
|
22
|
+
./bin/resque_scheduler_start
|
23
|
+
./bin/resque_crawler_start
|
24
|
+
|
25
|
+
To get a Resque web dashboard at <http://localhost:8282>, run:
|
26
|
+
|
27
|
+
./bin/resque_server_start
|
28
|
+
|
29
|
+
Initialize Sengi. This writes the default variables and a blacklist of the deepweb to Redis.
|
30
|
+
|
31
|
+
RUBYOPT=-rbundler/setup ruby ./bin/config --init
|
32
|
+
|
33
|
+
## Usage
|
34
|
+
|
35
|
+
### Queue
|
36
|
+
|
37
|
+
To queue a URL to be crawled, run:
|
38
|
+
|
39
|
+
RUBYOPT=-rbundler/setup ruby ./bin/crawler -q http://example.com
|
40
|
+
|
41
|
+
### Relative Links Only
|
42
|
+
|
43
|
+
To crawl only relative links on `example.com`:
|
44
|
+
|
45
|
+
RUBYOPT=-rbundler/setup ruby ./bin/crawler -r http://example.com
|
46
|
+
|
47
|
+
### Serial
|
48
|
+
|
49
|
+
Crawl only one URL at a time. The latest datetime will be stored in the Redis key `urls:schedule:last`. A new URL to crawl will be scheduled for a new datetime calculated by `urls:schedule:last + url_delay`, where `url_delay` is the number of seconds between the scheduled URLs.
|
50
|
+
|
51
|
+
RUBYOPT=-rbundler/setup ruby ./bin/crawler -s http://example.com
|
52
|
+
|
53
|
+
## License
|
54
|
+
|
55
|
+
Copyright (C) 2016 Christian Mayer <http://fox21.at>
|
56
|
+
|
57
|
+
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
|
58
|
+
|
59
|
+
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
|
data/Rakefile
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
|
2
|
+
# Make the project's own lib/ directory loadable so that `require 'sengi'`
# resolves against the working copy.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)

require 'resque'
require 'resque/tasks'
require 'resque-scheduler'
require 'resque/scheduler/tasks'

require 'sengi'

namespace :resque do
  # Point Resque at the Redis instance on 127.0.0.1:7000.
  task :setup do
    puts 'resque setup'
    Resque.redis = '127.0.0.1:7000'
  end

  # Scheduler setup hook; runs the plain Resque setup first.
  task setup_schedule: :setup do
    puts 'schedule setup'
  end

  # Add setup_schedule as a prerequisite of resque:scheduler
  # (presumably the task defined by resque/scheduler/tasks -- confirm).
  task scheduler: :setup_schedule
end
|
data/bin/config
ADDED
@@ -0,0 +1,148 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: UTF-8
|
3
|
+
|
4
|
+
require 'optparse'
|
5
|
+
require 'hiredis'
|
6
|
+
require 'sengi'
|
7
|
+
|
8
|
+
|
9
|
+
# Open a raw Hiredis connection to 127.0.0.1:7000 and switch it to database 1.
# The reply to SELECT is read so it does not stay queued for the next command.
@redis = Hiredis::Connection.new.tap do |connection|
  connection.connect('127.0.0.1', 7000)
  connection.write(['SELECT', 1])
  connection.read
end
|
13
|
+
|
14
|
+
# Flush Redis databases 0 and 1, printing the server's reply to each FLUSHDB.
#
# Database 1 is the last one selected, so the connection stays on database 1
# after this method returns.
def reset
  (0..1).each do |db_index|
    # Switch to the database; the SELECT reply is read and discarded.
    @redis.write(['SELECT', db_index])
    @redis.read

    @redis.write(['FLUSHDB'])
    flush_reply = @redis.read
    puts "FLUSH DB #{db_index}: #{flush_reply}"
  end
end
|
22
|
+
|
23
|
+
# Write the initial keys to the currently selected Redis database and print
# the server's reply for each command.
#
# Keys set: urls:schedule:lock (0), urls:schedule:last (now + 180 seconds),
# urls:delay, urls:separatedelay and urls:reschedule (from the TheFox::Sengi
# constants), plus the members of the domains:ignore set.
def init
  # Issue one SET and print its reply, prefixed with a human-readable label.
  set_key = lambda do |label, key, value|
    @redis.write(['SET', key, value])
    puts "#{label}: #{@redis.read}"
  end

  set_key.call('urls schedule lock', 'urls:schedule:lock', 0)
  set_key.call('urls schedule last', 'urls:schedule:last', (Time.now + 180).strftime('%F %T %z'))
  set_key.call('urls delay', 'urls:delay', TheFox::Sengi::URL_DELAY)
  set_key.call('urls separatedelay', 'urls:separatedelay', TheFox::Sengi::URL_SEPARATE_DELAY)
  set_key.call('urls reschedule', 'urls:reschedule', TheFox::Sengi::URL_RESCHEDULE)

  # Some entries ('amazon', 'google', ...) carry no TLD.
  # NOTE(review): how entries are matched against hosts is not visible here.
  ignored_domains = %w[
    4chan.org
    about.me
    amazon
    ask.fm
    bitbucket.org
    bit.ly bitly.com
    bbc.com
    blockchain.info
    blogger.com
    blogspot
    cnet.com
    cnn.com
    delicious.com
    digg.com
    disqus.com
    doodle.com
    dropbox.com
    droplr.com
    duckduckgo.com
    ebay.com
    facebook.com fb.com fb.me
    flickr.com
    getpocket.com
    github.com
    google
    gravatar.com
    imdb.com
    imgur.com
    instagram.com
    jsbin.com
    jsfiddle.net
    keybase.io
    kickstarter.com
    linkedin.com
    localhost
    myspace.com
    npmjs.com
    openstreetmap.org osm.org
    packagist.org
    pastebin.com
    paypal.com
    reddit.com
    skype.com
    slack.com
    slashdot.org
    soundcloud.com
    thepiratebay
    tumblr.com
    twitpic.com
    twitter.com
    vimeo.com
    wikipedia.org
    willhaben.at
    ycombinator.com
    xing.com
    yahoo.com
    youtube
  ]

  # All members are added with a single SADD command.
  @redis.write(['SADD', 'domains:ignore', *ignored_domains])
  puts "domains ignore: #{@redis.read}"
end
|
100
|
+
|
101
|
+
@options = {}

# Command line definition. --reset, --init and --reinit do their work inside
# the option handler and exit right away; --help prints the usage and exits
# with status 3.
option_parser = OptionParser.new do |parser|
  parser.banner = 'Usage: config [options] <command>'
  parser.separator('')

  parser.on_tail('--reset', 'Reset all.') do
    reset
    exit
  end

  parser.on_tail('--init', 'Set up all initial keys.') do
    init
    exit
  end

  parser.on_tail('--reinit', 'Same as --reset and --init.') do
    reset
    init
    exit
  end

  parser.on_tail('-h', '--help', 'Show this message.') do
    puts parser
    puts
    puts 'Commands'
    puts ' domain'
    exit 3
  end
end

# Without any argument, behave as if --help had been given.
ARGV.push('-h') if ARGV.empty?
arguments = option_parser.parse(ARGV)

# Only supported command: `domain ignore add <domain...>`.
# Each domain is lower-cased and added to the domains:ignore set; the server's
# reply is printed per domain. Any other command is ignored.
if arguments.first(3) == %w[domain ignore add]
  arguments.drop(3).each do |domain|
    @redis.write(['SADD', 'domains:ignore', domain.downcase])
    puts @redis.read
  end
end
|
data/bin/crawler
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: UTF-8
|
3
|
+
|
4
|
+
require 'optparse'
|
5
|
+
require 'resque'
|
6
|
+
require 'resque-scheduler'
|
7
|
+
require 'time'
|
8
|
+
require 'sengi'
|
9
|
+
|
10
|
+
|
11
|
+
# Options handed to the crawler worker together with each URL.
# NOTE(review): 'force' has no default entry; it is only present when -f is given.
@options = {
  'queue' => false,
  'serial' => false,
  'relative' => false,
  'debug' => false,
}

opts = OptionParser.new do |o|
  o.banner = 'Usage: crawler [options] <url...>'
  o.separator('')

  o.on('-q', '--queue', 'Enqueue a URL.') do
    @options['queue'] = true
  end

  o.on('-s', '--serial', 'Schedule the URLs serial.') do
    # Schedule the URLs serially: the Redis key 'urls:schedule:last' stores
    # the last used schedule time, and URL_DELAY is added to it to get the
    # schedule time for the new URL.
    # Without this option URL_DELAY is added to the current time instead.
    @options['serial'] = true
  end

  o.on('-r', '--relative', 'Follow only relative links.') do
    # Also follows URLs with the same host.
    @options['relative'] = true
  end

  o.on('-f', '--force', 'Force a URL to be requested.') do
    @options['force'] = true
  end

  o.on('-d', 'Debug') do
    @options['debug'] = true
  end

  o.on_tail('-h', '--help', 'Show this message.') do
    puts o
    puts
    exit 3
  end
end

# Without any argument, behave as if --help had been given.
ARGV << '-h' if ARGV.empty?
urls = opts.parse(ARGV)

Resque.redis = '127.0.0.1:7000'

# With --queue each URL becomes a Resque job; otherwise the worker is run
# directly in this process.
urls.each do |url|
  if @options['queue']
    Resque.enqueue(TheFox::Sengi::CrawlerWorker, url, @options)
  else
    TheFox::Sengi::CrawlerWorker.perform(url, @options)
  end
end
|