kabutops 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +3 -3
- data/lib/kabutops.rb +0 -1
- data/lib/kabutops/adapters/base.rb +2 -2
- data/lib/kabutops/adapters/database_adapter.rb +1 -0
- data/lib/kabutops/crawler.rb +1 -1
- data/lib/kabutops/version.rb +1 -1
- metadata +9 -10
- data/lib/kabutops/spider.rb +0 -95
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 87c6590089f52a068980373fdb2b060618b34940
|
4
|
+
data.tar.gz: 7505ade7b0104e76c62e73594de557cc75c07944
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a48556126558daa3e7592fcd4eaa7767b8cf3157a4de28e862800783fed992e37dc43995506511063aa15eccf1d6f4fe9b7605fc546c07627f393f57c6c61284
|
7
|
+
data.tar.gz: ae9e2711c691fef7135d1e45ea132ff0bd075df7e02c2ee6371251f0388c9b7f73751c539e384d531e9a0fc05be4852fccfe36226477a3b5993ed2dbda9a6861
|
data/README.md
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
# Kabutops [![Code Climate](https://codeclimate.com/github/reneklacan/kabutops.png)](https://codeclimate.com/github/reneklacan/kabutops) [![Coverage](https://codeclimate.com/github/reneklacan/kabutops/coverage.png)](https://codeclimate.com/github/reneklacan/kabutops)
|
2
2
|
|
3
|
-
Kabutops is a ruby library
|
3
|
+
Kabutops is a ruby library which aims to simplify creating website crawlers.
|
4
4
|
You can define what will be crawled and how it will be saved in the short class definition.
|
5
5
|
|
6
|
-
With Kabutops you can easily save data to **ElasticSearch**.
|
6
|
+
With Kabutops you can easily save data to **ElasticSearch 2.x**.
|
7
7
|
|
8
8
|
Example for every kind of database are located
|
9
9
|
in the [examples directory](https://github.com/reneklacan/kabutops/tree/master/examples)
|
@@ -19,7 +19,7 @@ gem install kabutops
|
|
19
19
|
Or you can put it in your Gemfile
|
20
20
|
|
21
21
|
```ruby
|
22
|
-
gem 'kabutops'
|
22
|
+
gem 'kabutops'
|
23
23
|
```
|
24
24
|
|
25
25
|
You will also need Redis database installed and running.
|
data/lib/kabutops.rb
CHANGED
data/lib/kabutops/crawler.rb
CHANGED
data/lib/kabutops/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kabutops
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rene Klacan
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-11-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -58,42 +58,42 @@ dependencies:
|
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
61
|
+
version: '4.0'
|
62
62
|
type: :runtime
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
68
|
+
version: '4.0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: elasticsearch
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version:
|
75
|
+
version: 1.0.14
|
76
76
|
type: :runtime
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version:
|
82
|
+
version: 1.0.14
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: hashie
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
87
|
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: '3.
|
89
|
+
version: '3.4'
|
90
90
|
type: :runtime
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: '3.
|
96
|
+
version: '3.4'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
98
|
name: json
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -174,7 +174,6 @@ files:
|
|
174
174
|
- lib/kabutops/extensions/parameterable.rb
|
175
175
|
- lib/kabutops/recipe.rb
|
176
176
|
- lib/kabutops/recipe_item.rb
|
177
|
-
- lib/kabutops/spider.rb
|
178
177
|
- lib/kabutops/version.rb
|
179
178
|
- lib/kabutops/watchdog.rb
|
180
179
|
homepage: https://github.com/reneklacan/kabutops
|
@@ -197,7 +196,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
197
196
|
version: '0'
|
198
197
|
requirements: []
|
199
198
|
rubyforge_project:
|
200
|
-
rubygems_version: 2.4.5
|
199
|
+
rubygems_version: 2.4.5.1
|
201
200
|
signing_key:
|
202
201
|
specification_version: 4
|
203
202
|
summary: Dead simple yet powerful Ruby crawler for easy parallel crawling with support
|
data/lib/kabutops/spider.rb
DELETED
@@ -1,95 +0,0 @@
|
|
1
|
-
# -*- encoding : utf-8 -*-
|
2
|
-
|
3
|
-
module Kabutops
|
4
|
-
|
5
|
-
class Spider < Crawler
|
6
|
-
class << self
|
7
|
-
params :url
|
8
|
-
callbacks :after_crawl, :before_cache, :follow_if
|
9
|
-
|
10
|
-
def debug_spider
|
11
|
-
enable_debug
|
12
|
-
self.new.perform({
|
13
|
-
url: params[:url]
|
14
|
-
})
|
15
|
-
end
|
16
|
-
|
17
|
-
def crawl collection=nil
|
18
|
-
super(collection || [{ url: params.url, }])
|
19
|
-
end
|
20
|
-
|
21
|
-
def reset!
|
22
|
-
super
|
23
|
-
redis.keys.each{ |k| redis.del(k) }
|
24
|
-
end
|
25
|
-
|
26
|
-
def << resource
|
27
|
-
if resource_status(resource).nil?
|
28
|
-
resource_status(resource, 'new')
|
29
|
-
super
|
30
|
-
end
|
31
|
-
end
|
32
|
-
|
33
|
-
def follow link
|
34
|
-
self << {
|
35
|
-
url: URI.join(params.url, URI.escape(link)).to_s
|
36
|
-
}
|
37
|
-
end
|
38
|
-
|
39
|
-
def resource_status resource, status=nil
|
40
|
-
url_status(resource[:url], status)
|
41
|
-
end
|
42
|
-
|
43
|
-
def url_status url, status=nil
|
44
|
-
key = redis_key(url)
|
45
|
-
|
46
|
-
if status
|
47
|
-
redis.set(
|
48
|
-
key,
|
49
|
-
JSON.dump({
|
50
|
-
url: url,
|
51
|
-
status: status,
|
52
|
-
})
|
53
|
-
)
|
54
|
-
else
|
55
|
-
item = redis.get(key)
|
56
|
-
item ? JSON.parse(item)['status'] : nil
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
protected
|
61
|
-
|
62
|
-
def redis_key string
|
63
|
-
Digest::SHA256.hexdigest(string)
|
64
|
-
end
|
65
|
-
|
66
|
-
def redis
|
67
|
-
@redis ||= ::Redis::Namespace.new(
|
68
|
-
self.to_s,
|
69
|
-
redis: ::Redis.new(
|
70
|
-
host: Configuration[:redis][:host],
|
71
|
-
port: Configuration[:redis][:port],
|
72
|
-
db: Configuration[:redis][:db],
|
73
|
-
)
|
74
|
-
)
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
def crawl resource
|
79
|
-
page = super
|
80
|
-
after_crawl(resource, page)
|
81
|
-
self.class.resource_status(resource, 'done')
|
82
|
-
page
|
83
|
-
end
|
84
|
-
|
85
|
-
def after_crawl resource, page
|
86
|
-
page.css('a').each do |a|
|
87
|
-
next if a['href'].nil?
|
88
|
-
|
89
|
-
follow = self.class.notify(:follow_if, a['href']).any?
|
90
|
-
self.class.follow(a['href']) if follow
|
91
|
-
end
|
92
|
-
end
|
93
|
-
end
|
94
|
-
|
95
|
-
end
|