sengi 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +7 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +71 -0
- data/Makefile +23 -0
- data/Makefile.common +58 -0
- data/README.md +59 -0
- data/Rakefile +25 -0
- data/bin/config +148 -0
- data/bin/crawler +64 -0
- data/bin/list +129 -0
- data/bin/redis_start +11 -0
- data/bin/redis_stats +13 -0
- data/bin/redis_stop +10 -0
- data/bin/resque_crawler_restart +14 -0
- data/bin/resque_crawler_start +21 -0
- data/bin/resque_crawler_stop +20 -0
- data/bin/resque_scheduler_start +15 -0
- data/bin/resque_scheduler_stop +16 -0
- data/bin/resque_server_start +13 -0
- data/bin/resque_server_stop +13 -0
- data/config/redis.conf +120 -0
- data/config/resque_server_config.rb +6 -0
- data/lib/sengi.rb +5 -0
- data/lib/sengi/crawler.rb +589 -0
- data/lib/sengi/crawler_worker.rb +16 -0
- data/lib/sengi/uri.rb +288 -0
- data/lib/sengi/version.rb +17 -0
- data/sengi.gemspec +37 -0
- data/sengi.sublime-project +10 -0
- data/tests/tc_crawler.rb +14 -0
- data/tests/tc_uri.rb +140 -0
- data/tests/ts_all.rb +4 -0
- metadata +202 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 027ebc7d9b5e1a8f6d3bb4b5c434beb27a152c24
|
4
|
+
data.tar.gz: c79a8f8dd0fdf168071b0ce93049f93e9fcb9422
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3799cf4daca51694b6f375d7ed68db4111d21c95176860845a5970e25ee8d3d056250a45f630776d4922b9d835b93f4134fa7facc2c5f5a4809a4cca568f022a
|
7
|
+
data.tar.gz: 9fabd1047750167bb3e7daaa7638fcd013a3fb04ede390b8b845d7743190a97bc81819f0af59938da8d92ef4d7bb5e501f1c5642b5361d5a60da0d55fd183b31
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
sengi (0.1.0.pre.dev.6)
|
5
|
+
activesupport (~> 4.2)
|
6
|
+
cookiejar (~> 0.3)
|
7
|
+
hiredis (~> 0.6)
|
8
|
+
nokogiri (~> 1.6)
|
9
|
+
redis (~> 3.2)
|
10
|
+
resque (~> 1.26)
|
11
|
+
resque-scheduler (~> 4.1)
|
12
|
+
thefox-ext (~> 1.4)
|
13
|
+
|
14
|
+
GEM
|
15
|
+
remote: https://rubygems.org/
|
16
|
+
specs:
|
17
|
+
activesupport (4.2.6)
|
18
|
+
i18n (~> 0.7)
|
19
|
+
json (~> 1.7, >= 1.7.7)
|
20
|
+
minitest (~> 5.1)
|
21
|
+
thread_safe (~> 0.3, >= 0.3.4)
|
22
|
+
tzinfo (~> 1.1)
|
23
|
+
cookiejar (0.3.3)
|
24
|
+
hiredis (0.6.1)
|
25
|
+
i18n (0.7.0)
|
26
|
+
json (1.8.3)
|
27
|
+
mini_portile2 (2.0.0)
|
28
|
+
minitest (5.8.4)
|
29
|
+
mono_logger (1.1.0)
|
30
|
+
multi_json (1.11.2)
|
31
|
+
nokogiri (1.6.7.2)
|
32
|
+
mini_portile2 (~> 2.0.0.rc2)
|
33
|
+
rack (1.6.4)
|
34
|
+
rack-protection (1.5.3)
|
35
|
+
rack
|
36
|
+
redis (3.3.0)
|
37
|
+
redis-namespace (1.5.2)
|
38
|
+
redis (~> 3.0, >= 3.0.4)
|
39
|
+
resque (1.26.0)
|
40
|
+
mono_logger (~> 1.0)
|
41
|
+
multi_json (~> 1.0)
|
42
|
+
redis-namespace (~> 1.3)
|
43
|
+
sinatra (>= 0.9.2)
|
44
|
+
vegas (~> 0.1.2)
|
45
|
+
resque-scheduler (4.1.0)
|
46
|
+
mono_logger (~> 1.0)
|
47
|
+
redis (~> 3.0)
|
48
|
+
resque (~> 1.25)
|
49
|
+
rufus-scheduler (~> 3.0)
|
50
|
+
rufus-scheduler (3.2.0)
|
51
|
+
sinatra (1.4.7)
|
52
|
+
rack (~> 1.5)
|
53
|
+
rack-protection (~> 1.4)
|
54
|
+
tilt (>= 1.3, < 3)
|
55
|
+
thefox-ext (1.4.1)
|
56
|
+
thread_safe (0.3.5)
|
57
|
+
tilt (2.0.2)
|
58
|
+
tzinfo (1.2.2)
|
59
|
+
thread_safe (~> 0.1)
|
60
|
+
vegas (0.1.11)
|
61
|
+
rack (>= 1.0.0)
|
62
|
+
|
63
|
+
PLATFORMS
|
64
|
+
ruby
|
65
|
+
|
66
|
+
DEPENDENCIES
|
67
|
+
minitest (~> 5.8)
|
68
|
+
sengi!
|
69
|
+
|
70
|
+
BUNDLED WITH
|
71
|
+
1.11.2
|
data/Makefile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
|
2
|
+
GEM_NAME = sengi
|
3
|
+
ALL_TARGETS_EXT = tmp run init
|
4
|
+
|
5
|
+
include Makefile.common
|
6
|
+
|
7
|
+
dev:
|
8
|
+
RUBYOPT=-rbundler/setup ruby ./bin/crawler -q http://dev.fox21.at/sengi/
|
9
|
+
|
10
|
+
run:
|
11
|
+
$(MKDIR) $@
|
12
|
+
|
13
|
+
.PHONY: reset
|
14
|
+
reset:
|
15
|
+
RUBYOPT=-rbundler/setup ruby ./bin/config --reset
|
16
|
+
|
17
|
+
.PHONY: init
|
18
|
+
init:
|
19
|
+
RUBYOPT=-rbundler/setup ruby ./bin/config --init
|
20
|
+
|
21
|
+
.PHONY: test
|
22
|
+
test:
|
23
|
+
RUBYOPT=-w $(BUNDLER) exec ./tests/ts_all.rb
|
data/Makefile.common
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
|
2
|
+
# Ruby Common Big
|
3
|
+
# 2016-04-09
|
4
|
+
|
5
|
+
MV = mv -nv
|
6
|
+
RM = rm -rf
|
7
|
+
MKDIR = mkdir -p
|
8
|
+
CHMOD = chmod
|
9
|
+
BUNDLER = bundle
|
10
|
+
BUNDLER_OPTIONS = --jobs=5 --retry=3
|
11
|
+
GEMSPEC_FILE = $(GEM_NAME).gemspec
|
12
|
+
|
13
|
+
.PHONY: all
|
14
|
+
all: setup $(ALL_TARGETS_EXT)
|
15
|
+
|
16
|
+
.PHONY: setup
|
17
|
+
setup: .setup
|
18
|
+
|
19
|
+
.setup:
|
20
|
+
$(BUNDLER) install $(BUNDLER_OPTIONS)
|
21
|
+
touch $@
|
22
|
+
|
23
|
+
.PHONY: install
|
24
|
+
# Build the gem from $(GEMSPEC_FILE), install it system-wide and remove the
# built .gem file afterwards.
# `set -e` plus the explicit emptiness check stop the recipe as soon as the
# build yields no file, so `sudo gem install` and `rm -rf` are never run with
# an empty argument (the pipeline's exit status alone would hide a failed
# `gem build`, because only the last command of a pipeline is checked).
install:
	set -e; \
	gem_file=$$(gem build $(GEMSPEC_FILE) | grep 'File:' | tail -1 | awk '{ print $$2 }'); \
	[ -n "$$gem_file" ] || { echo 'gem build produced no file' >&2; exit 1; }; \
	sudo gem install "$$gem_file"; \
	$(RM) "$$gem_file"
|
28
|
+
|
29
|
+
.PHONY: uninstall
|
30
|
+
uninstall:
|
31
|
+
sudo gem uninstall $(GEM_NAME)
|
32
|
+
|
33
|
+
.PHONY: update
|
34
|
+
update:
|
35
|
+
$(BUNDLER) update
|
36
|
+
|
37
|
+
.PHONY: clean
|
38
|
+
clean:
|
39
|
+
$(RM) .bundle
|
40
|
+
$(RM) .setup
|
41
|
+
$(RM) Gemfile.lock
|
42
|
+
|
43
|
+
.PHONY: release
|
44
|
+
release: | releases
|
45
|
+
set -e; \
|
46
|
+
gem_file=$$(gem build $(GEMSPEC_FILE) | grep 'File:' | tail -1 | awk '{ print $$2 }'); \
|
47
|
+
dst="releases/$$gem_file"; \
|
48
|
+
[ ! -f $$dst ]; \
|
49
|
+
$(MV) $$gem_file releases; \
|
50
|
+
gem push $$dst; \
|
51
|
+
echo 'done'
|
52
|
+
|
53
|
+
releases:
|
54
|
+
$(MKDIR) $@
|
55
|
+
|
56
|
+
tmp:
|
57
|
+
$(MKDIR) $@
|
58
|
+
$(CHMOD) u=rwx,go-rwx $@
|
data/README.md
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
# Sengi Web Crawler
|
2
|
+
|
3
|
+
A web crawler using Ruby and Redis.
|
4
|
+
|
5
|
+
## Install
|
6
|
+
|
7
|
+
First, run:
|
8
|
+
|
9
|
+
gem install rake bundler nokogiri hiredis
|
10
|
+
make
|
11
|
+
|
12
|
+
## Setup
|
13
|
+
|
14
|
+
[Redis](http://redis.io/) is used to store everything, so it always needs to be running in order to use Sengi.
|
15
|
+
|
16
|
+
Start Redis:
|
17
|
+
|
18
|
+
./bin/redis_start
|
19
|
+
|
20
|
+
Start [Resque](https://github.com/resque/resque) -- Scheduler and Worker:
|
21
|
+
|
22
|
+
./bin/resque_scheduler_start
|
23
|
+
./bin/resque_crawler_start
|
24
|
+
|
25
|
+
To get a Resque web dashboard at <http://localhost:8282>, run:
|
26
|
+
|
27
|
+
./bin/resque_server_start
|
28
|
+
|
29
|
+
Initialize Sengi. This stores the default variables in Redis, along with a blacklist of the deep web (the `domains:ignore` set).
|
30
|
+
|
31
|
+
RUBYOPT=-rbundler/setup ruby ./bin/config --init
|
32
|
+
|
33
|
+
## Usage
|
34
|
+
|
35
|
+
### Queue
|
36
|
+
|
37
|
+
To queue a URL to be crawled, run:
|
38
|
+
|
39
|
+
RUBYOPT=-rbundler/setup ruby ./bin/crawler -q http://example.com
|
40
|
+
|
41
|
+
### Relative Links Only
|
42
|
+
|
43
|
+
To crawl only relative links on `example.com`:
|
44
|
+
|
45
|
+
RUBYOPT=-rbundler/setup ruby ./bin/crawler -r http://example.com
|
46
|
+
|
47
|
+
### Serial
|
48
|
+
|
49
|
+
Crawl only one URL at a time. The latest datetime will be stored in the Redis key `urls:schedule:last`. A new URL to crawl will be scheduled for a new datetime calculated as `urls:schedule:last + url_delay`, where `url_delay` is the number of seconds between the scheduled URLs.
|
50
|
+
|
51
|
+
RUBYOPT=-rbundler/setup ruby ./bin/crawler -s http://example.com
|
52
|
+
|
53
|
+
## License
|
54
|
+
|
55
|
+
Copyright (C) 2016 Christian Mayer <http://fox21.at>
|
56
|
+
|
57
|
+
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
|
58
|
+
|
59
|
+
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
|
data/Rakefile
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
|
5
|
+
require 'resque'
|
6
|
+
require 'resque/tasks'
|
7
|
+
require 'resque-scheduler'
|
8
|
+
require 'resque/scheduler/tasks'
|
9
|
+
|
10
|
+
require 'sengi'
|
11
|
+
|
12
|
+
# Tasks added to the `resque` namespace: a setup step that configures the
# Redis endpoint, and a scheduler setup step chained behind it.
namespace :resque do
  # Prints a marker line and points Resque at Redis on 127.0.0.1:7000.
  task :setup do
    puts 'resque setup'
    Resque.redis = '127.0.0.1:7000'
  end

  # Runs after :setup; currently it only prints a marker line.
  task setup_schedule: [:setup] do
    puts 'schedule setup'
  end

  # :scheduler gets :setup_schedule as a prerequisite.
  task scheduler: [:setup_schedule]
end
|
data/bin/config
ADDED
@@ -0,0 +1,148 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: UTF-8
|
3
|
+
|
4
|
+
require 'optparse'
|
5
|
+
require 'hiredis'
|
6
|
+
require 'sengi'
|
7
|
+
|
8
|
+
|
9
|
+
# Open a raw Hiredis connection to 127.0.0.1:7000 and select database 1.
# The reply to SELECT is read and discarded so that later reads line up with
# the commands written by `reset` and `init`.
@redis = Hiredis::Connection.new.tap do |connection|
  connection.connect('127.0.0.1', 7000)
  connection.write(['SELECT', 1])
  connection.read
end
|
13
|
+
|
14
|
+
# Flushes Redis databases 0 and 1, one after the other, over the shared
# @redis connection and prints the reply of each FLUSHDB.
# The connection is left with database 1 selected, since that is the last
# database visited.
def reset
  (0..1).each do |database|
    @redis.write(['SELECT', database])
    @redis.read # consume the SELECT reply before sending the next command

    @redis.write(['FLUSHDB'])
    flush_reply = @redis.read
    puts "FLUSH DB #{database}: #{flush_reply}"
  end
end
|
22
|
+
|
23
|
+
# Seeds Redis (via the shared @redis connection) with Sengi's initial keys and
# prints the reply to every command.
#
# Written in this order:
# * urls:schedule:lock  - 0
# * urls:schedule:last  - current time plus 180 seconds, formatted '%F %T %z'
# * urls:delay          - TheFox::Sengi::URL_DELAY
# * urls:separatedelay  - TheFox::Sengi::URL_SEPARATE_DELAY
# * urls:reschedule     - TheFox::Sengi::URL_RESCHEDULE
# * domains:ignore      - set filled with the built-in domain list below
def init
  # One SET round trip: send the command, then echo the reply under `label`.
  set_key = lambda do |label, key, value|
    @redis.write(['SET', key, value])
    puts "#{label}: #{@redis.read}"
  end

  set_key.call('urls schedule lock', 'urls:schedule:lock', 0)
  set_key.call('urls schedule last', 'urls:schedule:last', (Time.now + 180).strftime('%F %T %z'))
  set_key.call('urls delay', 'urls:delay', TheFox::Sengi::URL_DELAY)
  set_key.call('urls separatedelay', 'urls:separatedelay', TheFox::Sengi::URL_SEPARATE_DELAY)
  set_key.call('urls reschedule', 'urls:reschedule', TheFox::Sengi::URL_RESCHEDULE)

  ignored_domains = %w[
    4chan.org
    about.me
    amazon
    ask.fm
    bitbucket.org
    bit.ly bitly.com
    bbc.com
    blockchain.info
    blogger.com
    blogspot
    cnet.com
    cnn.com
    delicious.com
    digg.com
    disqus.com
    doodle.com
    dropbox.com
    droplr.com
    duckduckgo.com
    ebay.com
    facebook.com fb.com fb.me
    flickr.com
    getpocket.com
    github.com
    google
    gravatar.com
    imdb.com
    imgur.com
    instagram.com
    jsbin.com
    jsfiddle.net
    keybase.io
    kickstarter.com
    linkedin.com
    localhost
    myspace.com
    npmjs.com
    openstreetmap.org osm.org
    packagist.org
    pastebin.com
    paypal.com
    reddit.com
    skype.com
    slack.com
    slashdot.org
    soundcloud.com
    thepiratebay
    tumblr.com
    twitpic.com
    twitter.com
    vimeo.com
    wikipedia.org
    willhaben.at
    ycombinator.com
    xing.com
    yahoo.com
    youtube
  ]

  # All domains are added with a single SADD command.
  @redis.write(['SADD', 'domains:ignore'] + ignored_domains)
  puts "domains ignore: #{@redis.read}"
end
|
100
|
+
|
101
|
+
# Command-line entry point of bin/config.
#
# The --reset / --init / --reinit switches run immediately while the arguments
# are being parsed and then exit. Whatever is left after parsing is treated as
# a command; the only one handled here is `domain ignore add <domain...>`.
@options = {}

opts = OptionParser.new do |parser|
  parser.banner = 'Usage: config [options] <command>'
  parser.separator('')

  parser.on_tail('--reset', 'Reset all.') do
    reset
    exit
  end

  parser.on_tail('--init', 'Set up all initial keys.') do
    init
    exit
  end

  parser.on_tail('--reinit', 'Same as --reset and --init.') do
    reset
    init
    exit
  end

  parser.on_tail('-h', '--help', 'Show this message.') do
    puts parser
    puts
    puts 'Commands'
    puts ' domain'
    exit 3
  end
end

# Without any argument, fall back to showing the help text.
ARGV << '-h' if ARGV.empty?
commands = opts.parse(ARGV)

# The first three words select the action; the rest are its arguments.
command, subcommand, subsubcommand = commands.shift(3)

if [command, subcommand, subsubcommand] == %w[domain ignore add]
  # Add each given domain, lower-cased, to the ignore set and print the reply.
  commands.each do |domain|
    @redis.write(['SADD', 'domains:ignore', domain.downcase])
    puts @redis.read
  end
end
|
data/bin/crawler
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: UTF-8
|
3
|
+
|
4
|
+
require 'optparse'
|
5
|
+
require 'resque'
|
6
|
+
require 'resque-scheduler'
|
7
|
+
require 'time'
|
8
|
+
require 'sengi'
|
9
|
+
|
10
|
+
|
11
|
+
# Command-line entry point of bin/crawler.
#
# Parses the flags into @options, then hands every remaining argument (a URL)
# together with @options to TheFox::Sengi::CrawlerWorker: enqueued through
# Resque when -q is given, otherwise performed directly in this process.

# Every flag starts out false. 'force' is listed here as well so that the
# hash always carries the same set of keys, whether or not -f was given.
@options = {
  'queue' => false,
  'serial' => false,
  'relative' => false,
  'force' => false,
  'debug' => false,
}

opts = OptionParser.new do |o|
  o.banner = 'Usage: crawler [options] <url...>'
  o.separator('')

  o.on('-q', '--queue', 'Enqueue a URL.') do
    @options['queue'] = true
  end

  o.on('-s', '--serial', 'Schedule the URLs serial.') do
    # Set this option to true to schedule the URLs serial.
    # The Redis key 'urls:schedule:last' will be used to store the last
    # used schedule time and URL_DELAY will be added to create a new
    # schedule time for the new URL.
    # Otherwise if this option isn't used URL_DELAY will be added
    # to the current time.
    @options['serial'] = true
  end

  o.on('-r', '--relative', 'Follow only relative links.') do
    # And also URLs with the same host.
    @options['relative'] = true
  end

  o.on('-f', '--force', 'Force a URL to be requested.') do
    @options['force'] = true
  end

  o.on('-d', 'Debug') do
    @options['debug'] = true
  end

  o.on_tail('-h', '--help', 'Show this message.') do
    puts o
    puts
    exit 3
  end
end

# Without any argument, fall back to showing the help text.
ARGV << '-h' if ARGV.empty?
urls = opts.parse(ARGV)

Resque.redis = '127.0.0.1:7000'
urls.each do |url|
  if @options['queue']
    Resque.enqueue(TheFox::Sengi::CrawlerWorker, url, @options)
  else
    TheFox::Sengi::CrawlerWorker.perform(url, @options)
  end
end
|