kabutops 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -3
- data/lib/kabutops.rb +0 -1
- data/lib/kabutops/adapters/base.rb +2 -2
- data/lib/kabutops/adapters/database_adapter.rb +1 -0
- data/lib/kabutops/crawler.rb +1 -1
- data/lib/kabutops/version.rb +1 -1
- metadata +9 -10
- data/lib/kabutops/spider.rb +0 -95
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 87c6590089f52a068980373fdb2b060618b34940
|
|
4
|
+
data.tar.gz: 7505ade7b0104e76c62e73594de557cc75c07944
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a48556126558daa3e7592fcd4eaa7767b8cf3157a4de28e862800783fed992e37dc43995506511063aa15eccf1d6f4fe9b7605fc546c07627f393f57c6c61284
|
|
7
|
+
data.tar.gz: ae9e2711c691fef7135d1e45ea132ff0bd075df7e02c2ee6371251f0388c9b7f73751c539e384d531e9a0fc05be4852fccfe36226477a3b5993ed2dbda9a6861
|
data/README.md
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
# Kabutops [](https://codeclimate.com/github/reneklacan/kabutops) [](https://codeclimate.com/github/reneklacan/kabutops)
|
|
2
2
|
|
|
3
|
-
Kabutops is a ruby library
|
|
3
|
+
Kabutops is a ruby library which aims to simplify creating website crawlers.
|
|
4
4
|
You can define what will be crawled and how it will be saved in the short class definition.
|
|
5
5
|
|
|
6
|
-
With Kabutops you can easily save data to **ElasticSearch**.
|
|
6
|
+
With Kabutops you can easily save data to **ElasticSearch 2.x**.
|
|
7
7
|
|
|
8
8
|
Example for every kind of database are located
|
|
9
9
|
in the [examples directory](https://github.com/reneklacan/kabutops/tree/master/examples)
|
|
@@ -19,7 +19,7 @@ gem install kabutops
|
|
|
19
19
|
Or you can put it in your Gemfile
|
|
20
20
|
|
|
21
21
|
```ruby
|
|
22
|
-
gem 'kabutops'
|
|
22
|
+
gem 'kabutops'
|
|
23
23
|
```
|
|
24
24
|
|
|
25
25
|
You will also need Redis database installed and running.
|
data/lib/kabutops.rb
CHANGED
data/lib/kabutops/crawler.rb
CHANGED
data/lib/kabutops/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kabutops
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.0
|
|
4
|
+
version: 0.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Rene Klacan
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2015-
|
|
11
|
+
date: 2015-11-23 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: mechanize
|
|
@@ -58,42 +58,42 @@ dependencies:
|
|
|
58
58
|
requirements:
|
|
59
59
|
- - "~>"
|
|
60
60
|
- !ruby/object:Gem::Version
|
|
61
|
-
version: '
|
|
61
|
+
version: '4.0'
|
|
62
62
|
type: :runtime
|
|
63
63
|
prerelease: false
|
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
|
65
65
|
requirements:
|
|
66
66
|
- - "~>"
|
|
67
67
|
- !ruby/object:Gem::Version
|
|
68
|
-
version: '
|
|
68
|
+
version: '4.0'
|
|
69
69
|
- !ruby/object:Gem::Dependency
|
|
70
70
|
name: elasticsearch
|
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
|
72
72
|
requirements:
|
|
73
73
|
- - "~>"
|
|
74
74
|
- !ruby/object:Gem::Version
|
|
75
|
-
version:
|
|
75
|
+
version: 1.0.14
|
|
76
76
|
type: :runtime
|
|
77
77
|
prerelease: false
|
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
|
79
79
|
requirements:
|
|
80
80
|
- - "~>"
|
|
81
81
|
- !ruby/object:Gem::Version
|
|
82
|
-
version:
|
|
82
|
+
version: 1.0.14
|
|
83
83
|
- !ruby/object:Gem::Dependency
|
|
84
84
|
name: hashie
|
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
|
86
86
|
requirements:
|
|
87
87
|
- - "~>"
|
|
88
88
|
- !ruby/object:Gem::Version
|
|
89
|
-
version: '3.
|
|
89
|
+
version: '3.4'
|
|
90
90
|
type: :runtime
|
|
91
91
|
prerelease: false
|
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
|
93
93
|
requirements:
|
|
94
94
|
- - "~>"
|
|
95
95
|
- !ruby/object:Gem::Version
|
|
96
|
-
version: '3.
|
|
96
|
+
version: '3.4'
|
|
97
97
|
- !ruby/object:Gem::Dependency
|
|
98
98
|
name: json
|
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -174,7 +174,6 @@ files:
|
|
|
174
174
|
- lib/kabutops/extensions/parameterable.rb
|
|
175
175
|
- lib/kabutops/recipe.rb
|
|
176
176
|
- lib/kabutops/recipe_item.rb
|
|
177
|
-
- lib/kabutops/spider.rb
|
|
178
177
|
- lib/kabutops/version.rb
|
|
179
178
|
- lib/kabutops/watchdog.rb
|
|
180
179
|
homepage: https://github.com/reneklacan/kabutops
|
|
@@ -197,7 +196,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
197
196
|
version: '0'
|
|
198
197
|
requirements: []
|
|
199
198
|
rubyforge_project:
|
|
200
|
-
rubygems_version: 2.4.5
|
|
199
|
+
rubygems_version: 2.4.5.1
|
|
201
200
|
signing_key:
|
|
202
201
|
specification_version: 4
|
|
203
202
|
summary: Dead simple yet powerful Ruby crawler for easy parallel crawling with support
|
data/lib/kabutops/spider.rb
DELETED
|
@@ -1,95 +0,0 @@
|
|
|
1
|
-
# -*- encoding : utf-8 -*-
|
|
2
|
-
|
|
3
|
-
module Kabutops
|
|
4
|
-
|
|
5
|
-
class Spider < Crawler
|
|
6
|
-
class << self
|
|
7
|
-
params :url
|
|
8
|
-
callbacks :after_crawl, :before_cache, :follow_if
|
|
9
|
-
|
|
10
|
-
def debug_spider
|
|
11
|
-
enable_debug
|
|
12
|
-
self.new.perform({
|
|
13
|
-
url: params[:url]
|
|
14
|
-
})
|
|
15
|
-
end
|
|
16
|
-
|
|
17
|
-
def crawl collection=nil
|
|
18
|
-
super(collection || [{ url: params.url, }])
|
|
19
|
-
end
|
|
20
|
-
|
|
21
|
-
def reset!
|
|
22
|
-
super
|
|
23
|
-
redis.keys.each{ |k| redis.del(k) }
|
|
24
|
-
end
|
|
25
|
-
|
|
26
|
-
def << resource
|
|
27
|
-
if resource_status(resource).nil?
|
|
28
|
-
resource_status(resource, 'new')
|
|
29
|
-
super
|
|
30
|
-
end
|
|
31
|
-
end
|
|
32
|
-
|
|
33
|
-
def follow link
|
|
34
|
-
self << {
|
|
35
|
-
url: URI.join(params.url, URI.escape(link)).to_s
|
|
36
|
-
}
|
|
37
|
-
end
|
|
38
|
-
|
|
39
|
-
def resource_status resource, status=nil
|
|
40
|
-
url_status(resource[:url], status)
|
|
41
|
-
end
|
|
42
|
-
|
|
43
|
-
def url_status url, status=nil
|
|
44
|
-
key = redis_key(url)
|
|
45
|
-
|
|
46
|
-
if status
|
|
47
|
-
redis.set(
|
|
48
|
-
key,
|
|
49
|
-
JSON.dump({
|
|
50
|
-
url: url,
|
|
51
|
-
status: status,
|
|
52
|
-
})
|
|
53
|
-
)
|
|
54
|
-
else
|
|
55
|
-
item = redis.get(key)
|
|
56
|
-
item ? JSON.parse(item)['status'] : nil
|
|
57
|
-
end
|
|
58
|
-
end
|
|
59
|
-
|
|
60
|
-
protected
|
|
61
|
-
|
|
62
|
-
def redis_key string
|
|
63
|
-
Digest::SHA256.hexdigest(string)
|
|
64
|
-
end
|
|
65
|
-
|
|
66
|
-
def redis
|
|
67
|
-
@redis ||= ::Redis::Namespace.new(
|
|
68
|
-
self.to_s,
|
|
69
|
-
redis: ::Redis.new(
|
|
70
|
-
host: Configuration[:redis][:host],
|
|
71
|
-
port: Configuration[:redis][:port],
|
|
72
|
-
db: Configuration[:redis][:db],
|
|
73
|
-
)
|
|
74
|
-
)
|
|
75
|
-
end
|
|
76
|
-
end
|
|
77
|
-
|
|
78
|
-
def crawl resource
|
|
79
|
-
page = super
|
|
80
|
-
after_crawl(resource, page)
|
|
81
|
-
self.class.resource_status(resource, 'done')
|
|
82
|
-
page
|
|
83
|
-
end
|
|
84
|
-
|
|
85
|
-
def after_crawl resource, page
|
|
86
|
-
page.css('a').each do |a|
|
|
87
|
-
next if a['href'].nil?
|
|
88
|
-
|
|
89
|
-
follow = self.class.notify(:follow_if, a['href']).any?
|
|
90
|
-
self.class.follow(a['href']) if follow
|
|
91
|
-
end
|
|
92
|
-
end
|
|
93
|
-
end
|
|
94
|
-
|
|
95
|
-
end
|