sengi 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 027ebc7d9b5e1a8f6d3bb4b5c434beb27a152c24
4
+ data.tar.gz: c79a8f8dd0fdf168071b0ce93049f93e9fcb9422
5
+ SHA512:
6
+ metadata.gz: 3799cf4daca51694b6f375d7ed68db4111d21c95176860845a5970e25ee8d3d056250a45f630776d4922b9d835b93f4134fa7facc2c5f5a4809a4cca568f022a
7
+ data.tar.gz: 9fabd1047750167bb3e7daaa7638fcd013a3fb04ede390b8b845d7743190a97bc81819f0af59938da8d92ef4d7bb5e501f1c5642b5361d5a60da0d55fd183b31
@@ -0,0 +1,7 @@
1
+ /.bundle/
2
+ .setup
3
+ *.rdb
4
+ /bin/dev
5
+ /run/
6
+ /tmp/
7
+ README.html
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+
2
+ source 'https://rubygems.org'
3
+ gemspec
@@ -0,0 +1,71 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ sengi (0.1.0.pre.dev.6)
5
+ activesupport (~> 4.2)
6
+ cookiejar (~> 0.3)
7
+ hiredis (~> 0.6)
8
+ nokogiri (~> 1.6)
9
+ redis (~> 3.2)
10
+ resque (~> 1.26)
11
+ resque-scheduler (~> 4.1)
12
+ thefox-ext (~> 1.4)
13
+
14
+ GEM
15
+ remote: https://rubygems.org/
16
+ specs:
17
+ activesupport (4.2.6)
18
+ i18n (~> 0.7)
19
+ json (~> 1.7, >= 1.7.7)
20
+ minitest (~> 5.1)
21
+ thread_safe (~> 0.3, >= 0.3.4)
22
+ tzinfo (~> 1.1)
23
+ cookiejar (0.3.3)
24
+ hiredis (0.6.1)
25
+ i18n (0.7.0)
26
+ json (1.8.3)
27
+ mini_portile2 (2.0.0)
28
+ minitest (5.8.4)
29
+ mono_logger (1.1.0)
30
+ multi_json (1.11.2)
31
+ nokogiri (1.6.7.2)
32
+ mini_portile2 (~> 2.0.0.rc2)
33
+ rack (1.6.4)
34
+ rack-protection (1.5.3)
35
+ rack
36
+ redis (3.3.0)
37
+ redis-namespace (1.5.2)
38
+ redis (~> 3.0, >= 3.0.4)
39
+ resque (1.26.0)
40
+ mono_logger (~> 1.0)
41
+ multi_json (~> 1.0)
42
+ redis-namespace (~> 1.3)
43
+ sinatra (>= 0.9.2)
44
+ vegas (~> 0.1.2)
45
+ resque-scheduler (4.1.0)
46
+ mono_logger (~> 1.0)
47
+ redis (~> 3.0)
48
+ resque (~> 1.25)
49
+ rufus-scheduler (~> 3.0)
50
+ rufus-scheduler (3.2.0)
51
+ sinatra (1.4.7)
52
+ rack (~> 1.5)
53
+ rack-protection (~> 1.4)
54
+ tilt (>= 1.3, < 3)
55
+ thefox-ext (1.4.1)
56
+ thread_safe (0.3.5)
57
+ tilt (2.0.2)
58
+ tzinfo (1.2.2)
59
+ thread_safe (~> 0.1)
60
+ vegas (0.1.11)
61
+ rack (>= 1.0.0)
62
+
63
+ PLATFORMS
64
+ ruby
65
+
66
+ DEPENDENCIES
67
+ minitest (~> 5.8)
68
+ sengi!
69
+
70
+ BUNDLED WITH
71
+ 1.11.2
@@ -0,0 +1,23 @@
1
+
2
# Project Makefile for the sengi gem.
# The generic gem rules (all, setup, install, release, tmp, ...) come from
# Makefile.common; only the project-specific targets are defined here.

GEM_NAME = sengi
# Extra prerequisites appended to `all` by Makefile.common.
ALL_TARGETS_EXT = tmp run init

include Makefile.common

# Enqueue the development site for crawling.
# Declared phony so a file or directory named `dev` cannot mask the command.
.PHONY: dev
dev:
	RUBYOPT=-rbundler/setup ruby ./bin/crawler -q http://dev.fox21.at/sengi/

# Real directory target (not phony): created once, then considered up to date.
run:
	$(MKDIR) $@

# Flush the Redis databases via `bin/config --reset`.
.PHONY: reset
reset:
	RUBYOPT=-rbundler/setup ruby ./bin/config --reset

# Write the initial keys to Redis via `bin/config --init`.
.PHONY: init
init:
	RUBYOPT=-rbundler/setup ruby ./bin/config --init

# Run the test suite with Ruby warnings enabled (-w).
.PHONY: test
test:
	RUBYOPT=-w $(BUNDLER) exec ./tests/ts_all.rb
@@ -0,0 +1,58 @@
1
+
2
# Ruby Common Big
# 2016-04-09
#
# Shared rules for Ruby gem projects. The including Makefile has to define
# GEM_NAME, and may define ALL_TARGETS_EXT, before the `include` line because
# both are expanded in prerequisite lists while this file is parsed.

MV = mv -nv
RM = rm -rf
MKDIR = mkdir -p
CHMOD = chmod
BUNDLER = bundle
BUNDLER_OPTIONS = --jobs=5 --retry=3
GEMSPEC_FILE = $(GEM_NAME).gemspec

# Default goal: install the bundle, then build the project-specific extras.
.PHONY: all
all: setup $(ALL_TARGETS_EXT)

.PHONY: setup
setup: .setup

# Stamp file recording a successful `bundle install`.
# It depends on the Gemfile and the gemspec so that a changed dependency
# list triggers a fresh install instead of being silently ignored.
.setup: Gemfile $(GEMSPEC_FILE)
	$(BUNDLER) install $(BUNDLER_OPTIONS)
	touch $@
22
+
23
# Build the gem, install it system-wide and remove the built file again.
# `set -e` and the explicit emptiness check stop the recipe as soon as
# `gem build` yields no file name, so `sudo gem install` is never run
# without an argument.
.PHONY: install
install:
	set -e; \
	gem_file=$$(gem build $(GEMSPEC_FILE) | grep 'File:' | tail -1 | awk '{ print $$2 }'); \
	if [ -z "$$gem_file" ]; then echo 'gem build produced no file' >&2; exit 1; fi; \
	sudo gem install "$$gem_file"; \
	$(RM) "$$gem_file"

# Remove the system-wide installation of the gem.
.PHONY: uninstall
uninstall:
	sudo gem uninstall $(GEM_NAME)

# Update the bundled dependencies.
.PHONY: update
update:
	$(BUNDLER) update

# Remove everything created by `setup`, including the lockfile.
.PHONY: clean
clean:
	$(RM) .bundle
	$(RM) .setup
	$(RM) Gemfile.lock
42
+
43
# Build the gem, archive it under releases/ and push it to the registry.
# `releases` is an order-only prerequisite: the directory must exist, but its
# timestamp is irrelevant. The recipe refuses to overwrite an archived
# release and reports why it stopped instead of failing silently.
.PHONY: release
release: | releases
	set -e; \
	gem_file=$$(gem build $(GEMSPEC_FILE) | grep 'File:' | tail -1 | awk '{ print $$2 }'); \
	if [ -z "$$gem_file" ]; then echo 'gem build produced no file' >&2; exit 1; fi; \
	dst="releases/$$gem_file"; \
	if [ -f "$$dst" ]; then echo "$$dst already exists" >&2; exit 1; fi; \
	$(MV) "$$gem_file" releases; \
	gem push "$$dst"; \
	echo 'done'

# Archive directory for released gem files.
releases:
	$(MKDIR) $@

# Scratch directory, accessible by the owner only.
tmp:
	$(MKDIR) $@
	$(CHMOD) u=rwx,go-rwx $@
@@ -0,0 +1,59 @@
1
+ # Sengi Web Crawler
2
+
3
+ A web crawler using Ruby and Redis.
4
+
5
+ ## Install
6
+
7
+ First, run:
8
+
9
+ gem install rake bundler nokogiri hiredis
10
+ make
11
+
12
+ ## Setup
13
+
14
+ [Redis](http://redis.io/) is used to store everything, so it always needs to be running when you use Sengi.
15
+
16
+ Start Redis:
17
+
18
+ ./bin/redis
19
+
20
+ Start [Resque](https://github.com/resque/resque) -- Scheduler and Worker:
21
+
22
+ ./bin/resque_scheduler_start
23
+ ./bin/resque_crawler_start
24
+
25
+ To get a Resque web dashboard at <http://localhost:8282>, run:
26
+
27
+ ./bin/resque_server
28
+
29
+ Initialize Sengi. This stores the default variables and a blacklist of the deepweb in Redis.
30
+
31
+ RUBYOPT=-rbundler/setup ruby ./bin/config --init
32
+
33
+ ## Usage
34
+
35
+ ### Queue
36
+
37
+ To queue a URL to be crawled, run:
38
+
39
+ RUBYOPT=-rbundler/setup ruby ./bin/crawler -q http://example.com
40
+
41
+ ### Relative Links Only
42
+
43
+ To crawl only relative links on `example.com`:
44
+
45
+ RUBYOPT=-rbundler/setup ruby ./bin/crawler -r http://example.com
46
+
47
+ ### Serial
48
+
49
+ Crawl only one URL at a time. The latest datetime will be stored in the Redis key `urls:schedule:last`. A new URL to crawl will be scheduled for a new datetime calculated by `urls:schedule:last + url_delay`, where `url_delay` is the number of seconds between the scheduled URLs.
50
+
51
+ RUBYOPT=-rbundler/setup ruby ./bin/crawler -s http://example.com
52
+
53
+ ## License
54
+
55
+ Copyright (C) 2016 Christian Mayer <http://fox21.at>
56
+
57
+ This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
58
+
59
+ This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
@@ -0,0 +1,25 @@
1
+
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+
5
+ require 'resque'
6
+ require 'resque/tasks'
7
+ require 'resque-scheduler'
8
+ require 'resque/scheduler/tasks'
9
+
10
+ require 'sengi'
11
+
12
# Resque task hooks: configure the Redis endpoint and chain the scheduler
# tasks onto that configuration step.
namespace :resque do
  # Point Resque at the Redis server listening on 127.0.0.1:7000.
  task :setup do
    puts('resque setup')
    Resque.redis = '127.0.0.1:7000'
  end

  # Only announces itself; depends on :setup having run first.
  task setup_schedule: [:setup] do
    puts('schedule setup')
  end

  # Make the scheduler task depend on the schedule setup.
  task scheduler: [:setup_schedule]
end
@@ -0,0 +1,148 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: UTF-8
3
+
4
+ require 'optparse'
5
+ require 'hiredis'
6
+ require 'sengi'
7
+
8
+
9
# Raw hiredis connection used by the rest of this script.
# Database 1 is selected right away and the SELECT reply is consumed.
@redis = Hiredis::Connection.new.tap do |connection|
  connection.connect('127.0.0.1', 7000)
  connection.write(['SELECT', 1])
  connection.read
end
13
+
14
# Flush Redis databases 0 and 1 and print the server reply for each one.
# The connection is left with database 1 selected, because it is visited last.
def reset
  (0..1).each do |db_index|
    @redis.write(['SELECT', db_index])
    # Consume the SELECT reply so the next read belongs to FLUSHDB.
    @redis.read

    @redis.write(['FLUSHDB'])
    flush_reply = @redis.read
    puts "FLUSH DB #{db_index}: #{flush_reply}"
  end
end
22
+
23
# Write the initial scheduling keys and the ignored-domain set to the
# currently selected Redis database, printing the server reply for each write.
def init
  # Issue one SET and print its reply under the given label.
  store = lambda do |label, key, value|
    @redis.write(['SET', key, value])
    puts "#{label}: #{@redis.read}"
  end

  store.call('urls schedule lock', 'urls:schedule:lock', 0)
  # The first schedule slot lies 180 seconds in the future.
  store.call('urls schedule last', 'urls:schedule:last', (Time.now + 180).strftime('%F %T %z'))
  store.call('urls delay', 'urls:delay', TheFox::Sengi::URL_DELAY)
  store.call('urls separatedelay', 'urls:separatedelay', TheFox::Sengi::URL_SEPARATE_DELAY)
  store.call('urls reschedule', 'urls:reschedule', TheFox::Sengi::URL_RESCHEDULE)

  # Members of the `domains:ignore` set, added with a single SADD.
  ignored_domains = %w[
    4chan.org
    about.me
    amazon
    ask.fm
    bitbucket.org
    bit.ly bitly.com
    bbc.com
    blockchain.info
    blogger.com
    blogspot
    cnet.com
    cnn.com
    delicious.com
    digg.com
    disqus.com
    doodle.com
    dropbox.com
    droplr.com
    duckduckgo.com
    ebay.com
    facebook.com fb.com fb.me
    flickr.com
    getpocket.com
    github.com
    google
    gravatar.com
    imdb.com
    imgur.com
    instagram.com
    jsbin.com
    jsfiddle.net
    keybase.io
    kickstarter.com
    linkedin.com
    localhost
    myspace.com
    npmjs.com
    openstreetmap.org osm.org
    packagist.org
    pastebin.com
    paypal.com
    reddit.com
    skype.com
    slack.com
    slashdot.org
    soundcloud.com
    thepiratebay
    tumblr.com
    twitpic.com
    twitter.com
    vimeo.com
    wikipedia.org
    willhaben.at
    ycombinator.com
    xing.com
    yahoo.com
    youtube
  ]
  @redis.write(['SADD', 'domains:ignore'] + ignored_domains)
  puts "domains ignore: #{@redis.read}"
end
100
+
101
@options = {}

# Command line definition. Each maintenance switch does its work as soon as
# it is parsed and then terminates the process.
opts = OptionParser.new do |parser|
  parser.banner = 'Usage: config [options] <command>'
  parser.separator('')

  parser.on_tail('--reset', 'Reset all.') do
    reset
    exit
  end

  parser.on_tail('--init', 'Set up all initial keys.') do
    init
    exit
  end

  parser.on_tail('--reinit', 'Same as --reset and --init.') do
    reset
    init
    exit
  end

  parser.on_tail('-h', '--help', 'Show this message.') do
    puts parser
    puts
    puts 'Commands'
    puts ' domain'
    exit 3
  end
end

# Without any argument, fall back to the help screen.
ARGV << '-h' if ARGV.empty?
command, subcommand, action, *domains = opts.parse(ARGV)

# The only implemented command is `domain ignore add <domain...>`:
# every given domain is lower-cased and added to the `domains:ignore` set.
if [command, subcommand, action] == %w[domain ignore add]
  domains.each do |domain|
    @redis.write(['SADD', 'domains:ignore', domain.downcase])
    puts @redis.read
  end
end
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: UTF-8
3
+
4
+ require 'optparse'
5
+ require 'resque'
6
+ require 'resque-scheduler'
7
+ require 'time'
8
+ require 'sengi'
9
+
10
+
11
# Default crawler options. Every flag the parser can set has an explicit
# default here, so the hash handed to the worker always has the same keys.
@options = {
  'queue' => false,
  'serial' => false,
  'relative' => false,
  'force' => false,
  'debug' => false,
}
opts = OptionParser.new do |o|
  o.banner = 'Usage: crawler [options] <url...>'
  o.separator('')

  o.on('-q', '--queue', 'Enqueue a URL.') do
    @options['queue'] = true
  end

  o.on('-s', '--serial', 'Schedule the URLs serial.') do
    # Set this option to true to schedule the URLs serial.
    # The Redis key 'urls:schedule:last' will be used to store the last
    # used schedule time and URL_DELAY will be added to create a new
    # schedule time for the new URL.
    # Otherwise if this option isn't used URL_DELAY will be added
    # to the current time.
    @options['serial'] = true
  end

  o.on('-r', '--relative', 'Follow only relative links.') do
    # And also URLs with the same host.
    @options['relative'] = true
  end

  o.on('-f', '--force', 'Force a URL to be requested.') do
    @options['force'] = true
  end

  o.on('-d', 'Debug') do
    @options['debug'] = true
  end

  o.on_tail('-h', '--help', 'Show this message.') do
    puts o
    puts
    exit 3
  end
end

# Without any argument, fall back to the help screen.
ARGV << '-h' if ARGV.count == 0
# Everything that is not an option is treated as a URL.
urls = opts.parse(ARGV)

Resque.redis = '127.0.0.1:7000'
urls.each do |url|
  if @options['queue']
    # Hand the URL to a Resque worker.
    Resque.enqueue(TheFox::Sengi::CrawlerWorker, url, @options)
  else
    # Crawl the URL directly in this process.
    TheFox::Sengi::CrawlerWorker.perform(url, @options)
  end
end