sengi 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 027ebc7d9b5e1a8f6d3bb4b5c434beb27a152c24
4
+ data.tar.gz: c79a8f8dd0fdf168071b0ce93049f93e9fcb9422
5
+ SHA512:
6
+ metadata.gz: 3799cf4daca51694b6f375d7ed68db4111d21c95176860845a5970e25ee8d3d056250a45f630776d4922b9d835b93f4134fa7facc2c5f5a4809a4cca568f022a
7
+ data.tar.gz: 9fabd1047750167bb3e7daaa7638fcd013a3fb04ede390b8b845d7743190a97bc81819f0af59938da8d92ef4d7bb5e501f1c5642b5361d5a60da0d55fd183b31
@@ -0,0 +1,7 @@
1
+ /.bundle/
2
+ .setup
3
+ *.rdb
4
+ /bin/dev
5
+ /run/
6
+ /tmp/
7
+ README.html
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+
2
+ source 'https://rubygems.org'
3
+ gemspec
@@ -0,0 +1,71 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ sengi (0.1.0.pre.dev.6)
5
+ activesupport (~> 4.2)
6
+ cookiejar (~> 0.3)
7
+ hiredis (~> 0.6)
8
+ nokogiri (~> 1.6)
9
+ redis (~> 3.2)
10
+ resque (~> 1.26)
11
+ resque-scheduler (~> 4.1)
12
+ thefox-ext (~> 1.4)
13
+
14
+ GEM
15
+ remote: https://rubygems.org/
16
+ specs:
17
+ activesupport (4.2.6)
18
+ i18n (~> 0.7)
19
+ json (~> 1.7, >= 1.7.7)
20
+ minitest (~> 5.1)
21
+ thread_safe (~> 0.3, >= 0.3.4)
22
+ tzinfo (~> 1.1)
23
+ cookiejar (0.3.3)
24
+ hiredis (0.6.1)
25
+ i18n (0.7.0)
26
+ json (1.8.3)
27
+ mini_portile2 (2.0.0)
28
+ minitest (5.8.4)
29
+ mono_logger (1.1.0)
30
+ multi_json (1.11.2)
31
+ nokogiri (1.6.7.2)
32
+ mini_portile2 (~> 2.0.0.rc2)
33
+ rack (1.6.4)
34
+ rack-protection (1.5.3)
35
+ rack
36
+ redis (3.3.0)
37
+ redis-namespace (1.5.2)
38
+ redis (~> 3.0, >= 3.0.4)
39
+ resque (1.26.0)
40
+ mono_logger (~> 1.0)
41
+ multi_json (~> 1.0)
42
+ redis-namespace (~> 1.3)
43
+ sinatra (>= 0.9.2)
44
+ vegas (~> 0.1.2)
45
+ resque-scheduler (4.1.0)
46
+ mono_logger (~> 1.0)
47
+ redis (~> 3.0)
48
+ resque (~> 1.25)
49
+ rufus-scheduler (~> 3.0)
50
+ rufus-scheduler (3.2.0)
51
+ sinatra (1.4.7)
52
+ rack (~> 1.5)
53
+ rack-protection (~> 1.4)
54
+ tilt (>= 1.3, < 3)
55
+ thefox-ext (1.4.1)
56
+ thread_safe (0.3.5)
57
+ tilt (2.0.2)
58
+ tzinfo (1.2.2)
59
+ thread_safe (~> 0.1)
60
+ vegas (0.1.11)
61
+ rack (>= 1.0.0)
62
+
63
+ PLATFORMS
64
+ ruby
65
+
66
+ DEPENDENCIES
67
+ minitest (~> 5.8)
68
+ sengi!
69
+
70
+ BUNDLED WITH
71
+ 1.11.2
@@ -0,0 +1,23 @@
1
+
2
# Project Makefile for the sengi gem.
# Project-specific settings are defined first because Makefile.common reads
# GEM_NAME and ALL_TARGETS_EXT.
GEM_NAME = sengi
ALL_TARGETS_EXT = tmp run init

include Makefile.common

# Enqueue the development site for crawling (-q = enqueue a URL).
# Declared phony so a file named `dev` can never mask this command.
.PHONY: dev
dev:
	RUBYOPT=-rbundler/setup ruby ./bin/crawler -q http://dev.fox21.at/sengi/

# Create the `run` directory.
run:
	$(MKDIR) $@

# Flush the Redis databases used by the project.
.PHONY: reset
reset:
	RUBYOPT=-rbundler/setup ruby ./bin/config --reset

# Write the initial keys to Redis.
.PHONY: init
init:
	RUBYOPT=-rbundler/setup ruby ./bin/config --init

# Run the test suite with Ruby warnings enabled.
.PHONY: test
test:
	RUBYOPT=-w $(BUNDLER) exec ./tests/ts_all.rb
@@ -0,0 +1,58 @@
1
+
2
+ # Ruby Common Big
3
+ # 2016-04-09
4
+
5
# Shell tools used by the recipes.
MV = mv -nv
RM = rm -rf
MKDIR = mkdir -p
CHMOD = chmod

# Bundler executable and the options passed to `bundle install`.
BUNDLER = bundle
BUNDLER_OPTIONS = --jobs=5 --retry=3

# GEM_NAME is not defined in this file; the including Makefile provides it.
GEMSPEC_FILE = $(GEM_NAME).gemspec

.PHONY: all setup

# Install the dependencies, then build the project-specific targets listed
# in ALL_TARGETS_EXT.
all: setup $(ALL_TARGETS_EXT)

setup: .setup

# Stamp file: `bundle install` is skipped for as long as .setup exists.
.setup:
	$(BUNDLER) install $(BUNDLER_OPTIONS)
	touch $@
22
+
23
# Build the gem from $(GEMSPEC_FILE), install it via sudo and remove the
# built .gem file again.
#
# `set -e` aborts the recipe on the first failing command. The explicit
# non-empty check is still required: the exit status of the command
# substitution is that of `awk`, so a failed `gem build` would otherwise go
# unnoticed and `gem install` / `rm -rf` would run without an argument.
.PHONY: install
install:
	set -e; \
	gem_file=$$(gem build $(GEMSPEC_FILE) | grep 'File:' | tail -1 | awk '{ print $$2 }'); \
	[ -n "$$gem_file" ] || { echo 'gem build did not produce a gem file' >&2; exit 1; }; \
	sudo gem install "$$gem_file"; \
	$(RM) "$$gem_file"
28
+
29
.PHONY: uninstall update clean

# Uninstall the gem via sudo.
uninstall:
	sudo gem uninstall $(GEM_NAME)

# Update the bundled dependencies.
update:
	$(BUNDLER) update

# Remove the Bundler directory, the setup stamp file and the lock file.
clean:
	$(RM) .bundle
	$(RM) .setup
	$(RM) Gemfile.lock
42
+
43
# Build the gem, move it into releases/ and push it.
#
# The recipe refuses to continue when the build produced no file (the exit
# status of the pipeline is that of `awk`, so `set -e` alone cannot detect a
# failed `gem build`) or when the same gem file was already released.
# `releases` is an order-only prerequisite: the directory must exist, but its
# timestamp is irrelevant.
.PHONY: release
release: | releases
	set -e; \
	gem_file=$$(gem build $(GEMSPEC_FILE) | grep 'File:' | tail -1 | awk '{ print $$2 }'); \
	[ -n "$$gem_file" ] || { echo 'gem build did not produce a gem file' >&2; exit 1; }; \
	dst="releases/$$gem_file"; \
	[ ! -f "$$dst" ] || { echo "$$dst already exists" >&2; exit 1; }; \
	$(MV) "$$gem_file" releases; \
	gem push "$$dst"; \
	echo 'done'

# Directory that collects the released gem files.
releases:
	$(MKDIR) $@
55
+
56
# Create the tmp directory and restrict it to its owner
# (full access for the user, none for group and others).
tmp:
	$(MKDIR) $@ && $(CHMOD) u=rwx,go-rwx $@
@@ -0,0 +1,59 @@
1
+ # Sengi Web Crawler
2
+
3
+ A web crawler using Ruby and Redis.
4
+
5
+ ## Install
6
+
7
+ First, run:
8
+
9
+ gem install rake bundler nokogiri hiredis
10
+ make
11
+
12
+ ## Setup
13
+
14
+ [Redis](http://redis.io/) is used to store everything, so it is always needed to run Sengi.
15
+
16
+ Start Redis:
17
+
18
+ ./bin/redis
19
+
20
+ Start [Resque](https://github.com/resque/resque) -- Scheduler and Worker:
21
+
22
+ ./bin/resque_scheduler_start
23
+ ./bin/resque_crawler_start
24
+
25
+ To get a Resque web dashboard at <http://localhost:8282>, run:
26
+
27
+ ./bin/resque_server
28
+
29
+ Initialize Sengi. This stores the default variables and a blacklist of the deep web in Redis.
30
+
31
+ RUBYOPT=-rbundler/setup ruby ./bin/config --init
32
+
33
+ ## Usage
34
+
35
+ ### Queue
36
+
37
+ To queue a URL to be crawled, run:
38
+
39
+ RUBYOPT=-rbundler/setup ruby ./bin/crawler -q http://example.com
40
+
41
+ ### Relative Links Only
42
+
43
+ To crawl only relative links on `example.com`:
44
+
45
+ RUBYOPT=-rbundler/setup ruby ./bin/crawler -r http://example.com
46
+
47
+ ### Serial
48
+
49
+ Crawl only one URL at a time. The latest datetime will be stored in the Redis key `urls:schedule:last`. A new URL to crawl will be scheduled for a new datetime calculated as `urls:schedule:last + url_delay`, where `url_delay` is the number of seconds between the scheduled URLs.
50
+
51
+ RUBYOPT=-rbundler/setup ruby ./bin/crawler -s http://example.com
52
+
53
+ ## License
54
+
55
+ Copyright (C) 2016 Christian Mayer <http://fox21.at>
56
+
57
+ This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
58
+
59
+ This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
@@ -0,0 +1,25 @@
1
+
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+
5
+ require 'resque'
6
+ require 'resque/tasks'
7
+ require 'resque-scheduler'
8
+ require 'resque/scheduler/tasks'
9
+
10
+ require 'sengi'
11
+
12
# Resque task hooks.
#
# resque:setup          - points Resque at the Redis server on 127.0.0.1:7000.
# resque:setup_schedule - runs after resque:setup.
# resque:scheduler      - gets resque:setup_schedule as a prerequisite.
namespace :resque do
  task :setup do
    puts 'resque setup'
    Resque.redis = '127.0.0.1:7000'
  end

  task setup_schedule: :setup do
    puts 'schedule setup'
  end

  task scheduler: :setup_schedule
end
@@ -0,0 +1,148 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: UTF-8
3
+
4
+ require 'optparse'
5
+ require 'hiredis'
6
+ require 'sengi'
7
+
8
+
9
# Open the Redis connection shared by all commands below and select
# database 1.
redis_host = '127.0.0.1'
redis_port = 7000

@redis = Hiredis::Connection.new
@redis.connect(redis_host, redis_port)
@redis.write(['SELECT', 1])
@redis.read # fetch the reply to SELECT
13
+
14
# Flush Redis databases 0 and 1 and print the server reply for each flush.
# Database 1 is the one left selected on the connection afterwards.
def reset
  (0..1).each do |db_index|
    @redis.write(['SELECT', db_index])
    @redis.read # fetch the reply to SELECT

    @redis.write(['FLUSHDB'])
    flush_reply = @redis.read
    puts "FLUSH DB #{db_index}: #{flush_reply}"
  end
end
22
+
23
# Write the initial keys to the currently selected Redis database and print
# the server reply for each command:
#
# * urls:schedule:lock  - set to 0
# * urls:schedule:last  - current time plus 180 seconds
# * urls:delay, urls:separatedelay, urls:reschedule - values of the
#   corresponding TheFox::Sengi constants
# * domains:ignore      - set of domains added via SADD
def init
  # Issues one SET and prints the reply, prefixed with the given label.
  store = lambda do |label, key, value|
    @redis.write(['SET', key, value])
    puts "#{label}: #{@redis.read}"
  end

  store.call('urls schedule lock', 'urls:schedule:lock', 0)
  store.call('urls schedule last', 'urls:schedule:last', (Time.now + 180).strftime('%F %T %z'))
  store.call('urls delay', 'urls:delay', TheFox::Sengi::URL_DELAY)
  store.call('urls separatedelay', 'urls:separatedelay', TheFox::Sengi::URL_SEPARATE_DELAY)
  store.call('urls reschedule', 'urls:reschedule', TheFox::Sengi::URL_RESCHEDULE)

  ignored_domains = %w[
    4chan.org
    about.me
    amazon
    ask.fm
    bitbucket.org
    bit.ly bitly.com
    bbc.com
    blockchain.info
    blogger.com
    blogspot
    cnet.com
    cnn.com
    delicious.com
    digg.com
    disqus.com
    doodle.com
    dropbox.com
    droplr.com
    duckduckgo.com
    ebay.com
    facebook.com fb.com fb.me
    flickr.com
    getpocket.com
    github.com
    google
    gravatar.com
    imdb.com
    imgur.com
    instagram.com
    jsbin.com
    jsfiddle.net
    keybase.io
    kickstarter.com
    linkedin.com
    localhost
    myspace.com
    npmjs.com
    openstreetmap.org osm.org
    packagist.org
    pastebin.com
    paypal.com
    reddit.com
    skype.com
    slack.com
    slashdot.org
    soundcloud.com
    thepiratebay
    tumblr.com
    twitpic.com
    twitter.com
    vimeo.com
    wikipedia.org
    willhaben.at
    ycombinator.com
    xing.com
    yahoo.com
    youtube
  ]
  @redis.write(['SADD', 'domains:ignore', *ignored_domains])
  puts "domains ignore: #{@redis.read}"
end
100
+
101
# Command line handling.
#
# The --reset, --init and --reinit switches act immediately while the
# arguments are parsed and then terminate the script. Help is printed when
# no argument is given and exits with status 3. The only positional command
# handled is `domain ignore add <domain...>`.
@options = {}

opts = OptionParser.new do |o|
  o.banner = 'Usage: config [options] <command>'
  o.separator('')

  o.on_tail('--reset', 'Reset all.') do
    reset
    exit
  end

  o.on_tail('--init', 'Set up all initial keys.') do
    init
    exit
  end

  o.on_tail('--reinit', 'Same as --reset and --init.') do
    reset
    init
    exit
  end

  o.on_tail('-h', '--help', 'Show this message.') do
    puts o
    puts
    puts 'Commands'
    puts ' domain'
    exit 3
  end
end

ARGV << '-h' if ARGV.empty?
commands = opts.parse(ARGV)

# The first three words select the action; everything after them is a
# domain name. Any other word sequence is ignored.
command, subcommand, subsubcommand = commands.shift(3)

if command == 'domain' && subcommand == 'ignore' && subsubcommand == 'add'
  commands.each do |domain|
    # Domains are stored in lower case.
    @redis.write(['SADD', 'domains:ignore', domain.downcase])
    puts @redis.read
  end
end
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: UTF-8
3
+
4
+ require 'optparse'
5
+ require 'resque'
6
+ require 'resque-scheduler'
7
+ require 'time'
8
+ require 'sengi'
9
+
10
+
11
# Defaults for every command line switch. The options hash is handed to the
# crawler worker as is, so each flag (including 'force') always has an
# explicit boolean value.
@options = {
  'queue' => false,
  'serial' => false,
  'relative' => false,
  'force' => false,
  'debug' => false,
}

opts = OptionParser.new do |o|
  o.banner = 'Usage: crawler [options] <url...>'
  o.separator('')

  o.on('-q', '--queue', 'Enqueue a URL.') do
    @options['queue'] = true
  end

  o.on('-s', '--serial', 'Schedule the URLs serial.') do
    # Set this option to true to schedule the URLs serial.
    # The Redis key 'urls:schedule:last' will be used to store the last
    # used schedule time and URL_DELAY will be added to create a new
    # schedule time for the new URL.
    # Otherwise if this option isn't used URL_DELAY will be added
    # to the current time.
    @options['serial'] = true
  end

  o.on('-r', '--relative', 'Follow only relative links.') do
    # And also URLs with the same host.
    @options['relative'] = true
  end

  o.on('-f', '--force', 'Force a URL to be requested.') do
    @options['force'] = true
  end

  o.on('-d', 'Debug') do
    @options['debug'] = true
  end

  o.on_tail('-h', '--help', 'Show this message.') do
    puts o
    puts
    exit 3
  end
end

# Without any argument, show the help (which exits with status 3).
ARGV << '-h' if ARGV.count == 0
urls = opts.parse(ARGV)

Resque.redis = '127.0.0.1:7000'

# With -q each URL is enqueued as a Resque job; otherwise the worker is run
# directly in this process.
urls.each do |url|
  if @options['queue']
    Resque.enqueue(TheFox::Sengi::CrawlerWorker, url, @options)
  else
    TheFox::Sengi::CrawlerWorker.perform(url, @options)
  end
end