organic-sitemap 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +54 -2
- data/lib/generators/organic_sitemap/templates/organic_sitemap.rb +7 -0
- data/lib/organic-sitemap.rb +3 -0
- data/lib/organic-sitemap/cache_manager.rb +14 -0
- data/lib/organic-sitemap/configuration.rb +2 -0
- data/lib/organic-sitemap/crawler_manager.rb +25 -0
- data/lib/organic-sitemap/redis_manager.rb +1 -0
- data/lib/organic-sitemap/url_processor.rb +4 -0
- data/lib/organic-sitemap/version.rb +1 -1
- data/organic_sitemap.gemspec +3 -1
- metadata +34 -5
- data/organic-sitemap-0.1.1.gem +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7e531fda12be1284f92a9281f34a951e1e38e8bc
|
4
|
+
data.tar.gz: 710e256a9584b0b5aaf05920f7d7758f32abc935
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6c17c4decabef8100527d31677d079ec99f03ad4ae342aeef75c53314593ace1266e1eb16c0887de9da6c2e7454c7e7b93d2478d0277caa56b24829491c00fac
|
7
|
+
data.tar.gz: 7abeeb1efc8acfcee36d4a4254b90a83e40255108ea1bc33bcc9b664817cd4f427f89e94d181eeb2d4c8ec2b8992bd8466712776fa35092749efb5d8410108c7
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -36,16 +36,68 @@ OrganicSitemap.configure do |config|
|
|
36
36
|
# OrganicSitemap ignores query params when identifying urls. You can add your allowed params
|
37
37
|
# config.allowed_params = [...]
|
38
38
|
|
39
|
-
# By default, all urls are saved on Redis.new(url: 'redis://127.0.0.1:6379'),
|
39
|
+
# By default, all urls are saved on Redis.new(url: 'redis://127.0.0.1:6379'),
|
40
|
+
# but you can set you own connection
|
40
41
|
# config.redis_connection = Your redis connection
|
41
42
|
|
42
43
|
# url are saved on a set on redis called "sitemap-urls", but if you want you can change it
|
43
44
|
# config.storage_key = your key
|
44
45
|
|
45
|
-
# By dafault all url have a expiry time in 7 days
|
46
|
+
# By dafault all url have a expiry time in 7 days
|
47
|
+
# after this time, if no one load this page, it will be removed from the set.
|
48
|
+
# To change it (number of days):
|
46
49
|
# config.expiry_time = X
|
50
|
+
|
51
|
+
# Crawler needs a domain to mount urls to visit
|
52
|
+
config.crawler_domain = "http://mi.domain.com"
|
53
|
+
|
54
|
+
# By default crawler_delay is 5sec. This is the time between get each url
|
55
|
+
# To change it (seconds of delay):
|
56
|
+
# config.crawler_delay = x
|
47
57
|
end
|
48
58
|
```
|
59
|
+
|
60
|
+
## Crawler and cache Services
|
61
|
+
|
62
|
+
If you have a front cache service, these services allow you to use your scored sitemap to warm up expired urls
|
63
|
+
|
64
|
+
When a page is loaded, its score is updated with the current time, which tells us when it was last cached. Using it we can get all expired urls and visit them to warm them up.
|
65
|
+
|
66
|
+
To configure it:
|
67
|
+
|
68
|
+
### On initializer define:
|
69
|
+
```
|
70
|
+
# Crawler needs a domain to mount urls to visit
|
71
|
+
config.crawler_domain = "http://mi.domain.com"
|
72
|
+
|
73
|
+
# By default crawler_delay is 5sec. This is the time between get each url
|
74
|
+
# To change it (seconds of delay):
|
75
|
+
# config.crawler_delay = x
|
76
|
+
```
|
77
|
+
|
78
|
+
With **CacheManager.uncached_urls(expiration_time: CacheExpirationTime, url_pattern: PATTERN)** we get all urls not hit within this time (all expired urls)
|
79
|
+
|
80
|
+
Example:
|
81
|
+
```
|
82
|
+
# Return urls not visited between 1.week.ago (set on config.expiry_time) and 3.hours.ago
|
83
|
+
OrganicSitemap::CacheManager.uncached_urls(expiration_time: 3.hours)
|
84
|
+
|
85
|
+
# Return urls not visited between 1.week.ago (set on config.expiry_time) and 3.hours.ago and containing the "/test/" string
|
86
|
+
OrganicSitemap::CacheManager.uncached_urls(expiration_time: 3.hours, url_pattern: "/test/")
|
87
|
+
|
88
|
+
# Return urls not visited between 1.week.ago (set on config.expiry_time) and 3.hours.ago and matching the ^\/test\/ regexp
|
89
|
+
OrganicSitemap::CacheManager.uncached_urls(expiration_time: 3.hours, url_pattern: /^\/test\//)
|
90
|
+
|
91
|
+
|
92
|
+
```
|
93
|
+
Then, with **CrawlerManager.warmup(urls)** we visit all these urls with the delay set in the configuration file (by default 5 sec). When we visit a url, *RedisManager* updates the score for this url, and it will not be visited again until its cache time expires
|
94
|
+
|
95
|
+
Example:
|
96
|
+
```
|
97
|
+
# For a 1.day page cache
|
98
|
+
CrawlerManager.warmup(CacheManager.uncached_urls(expiration_time: 3.hours))
|
99
|
+
```
|
100
|
+
|
49
101
|
## Rails config generator
|
50
102
|
|
51
103
|
Copy base config file on your Rails app by
|
@@ -19,4 +19,11 @@ OrganicSitemap.configure do |config|
|
|
19
19
|
# after this time, if no one load this page, it will be removed from the set.
|
20
20
|
# To change it (number of days):
|
21
21
|
# config.expiry_time = X
|
22
|
+
|
23
|
+
# Crawler needs a domain to mount urls to visit
|
24
|
+
config.crawler_domain = "http://mi.domain.com"
|
25
|
+
|
26
|
+
# By default crawler_delay is 5sec. This is the time between get each url
|
27
|
+
# To change it (seconds of delay):
|
28
|
+
# config.crawler_delay = x
|
22
29
|
end
|
data/lib/organic-sitemap.rb
CHANGED
@@ -2,7 +2,10 @@ require 'organic-sitemap/version'
|
|
2
2
|
require 'organic-sitemap/redis_manager'
|
3
3
|
require 'organic-sitemap/configuration'
|
4
4
|
require 'organic-sitemap/url_processor'
|
5
|
+
require 'organic-sitemap/cache_manager'
|
6
|
+
require 'organic-sitemap/crawler_manager'
|
5
7
|
require 'redis'
|
8
|
+
require 'httparty'
|
6
9
|
|
7
10
|
if defined? Rails
|
8
11
|
require 'organic-sitemap/railtie'
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module OrganicSitemap
|
2
|
+
class CacheManager
|
3
|
+
def self.uncached_urls(expiration_time:, url_pattern: "")
|
4
|
+
opts = {from: Time.now, to: to(expiration_time)}
|
5
|
+
urls = OrganicSitemap::RedisManager.sitemap_urls(opts)
|
6
|
+
urls.select{|x| x[url_pattern]}
|
7
|
+
end
|
8
|
+
|
9
|
+
private
|
10
|
+
def self.to(expiration_time)
|
11
|
+
Time.now + OrganicSitemap.configuration.expiry_time.days - expiration_time
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
@@ -11,6 +11,7 @@ module OrganicSitemap
|
|
11
11
|
class Configuration
|
12
12
|
attr_accessor :storage, :storage_key, :domains, :allowed_params,
|
13
13
|
:skipped_urls, :redis_connection, :expiry_time
|
14
|
+
attr_accessor :crawler_domain, :crawler_delay
|
14
15
|
|
15
16
|
def initialize
|
16
17
|
@storage = 'redis'
|
@@ -19,6 +20,7 @@ module OrganicSitemap
|
|
19
20
|
@skipped_urls = []
|
20
21
|
@redis_connection = Redis.new(url: 'redis://127.0.0.1:6379')
|
21
22
|
@expiry_time = 7
|
23
|
+
@crawler_delay = 5
|
22
24
|
end
|
23
25
|
end
|
24
26
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module OrganicSitemap
|
2
|
+
class CrawlerManager
|
3
|
+
def self.warmup(urls)
|
4
|
+
[*urls].each do |uri|
|
5
|
+
response = HTTParty.get(url_for(uri))
|
6
|
+
|
7
|
+
url_processed = OrganicSitemap::UrlProcessor.new(response.code, response.headers, response.request)
|
8
|
+
|
9
|
+
unless url_processed.url_from_cache_valid?
|
10
|
+
OrganicSitemap::RedisManager.remove_key(key: uri)
|
11
|
+
end
|
12
|
+
sleep crawler_delay
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
private
|
17
|
+
|
18
|
+
def self.url_for(uri)
|
19
|
+
"#{OrganicSitemap.configuration.crawler_domain}#{uri}"
|
20
|
+
end
|
21
|
+
def self.crawler_delay
|
22
|
+
OrganicSitemap.configuration.crawler_delay
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -24,6 +24,10 @@ module OrganicSitemap
|
|
24
24
|
success_response? && html_page? && request.get? && is_expected_domain? && is_allowed_url?
|
25
25
|
end
|
26
26
|
|
27
|
+
def url_from_cache_valid?
|
28
|
+
success_response? && html_page? && is_expected_domain?
|
29
|
+
end
|
30
|
+
|
27
31
|
private
|
28
32
|
|
29
33
|
def success_response?
|
data/organic_sitemap.gemspec
CHANGED
@@ -20,9 +20,11 @@ Gem::Specification.new do |spec|
|
|
20
20
|
spec.require_paths = ["lib"]
|
21
21
|
|
22
22
|
spec.add_dependency 'rack', '~> 1.0', '>= 1.0.0'
|
23
|
+
spec.add_dependency 'httparty'
|
23
24
|
|
24
25
|
spec.add_development_dependency "bundler", "~> 1.9"
|
25
|
-
spec.add_development_dependency "rspec"
|
26
|
+
spec.add_development_dependency "rspec"
|
27
|
+
spec.add_development_dependency "rspec-core"
|
26
28
|
spec.add_development_dependency "rake", "~> 10.0"
|
27
29
|
spec.add_development_dependency 'simplecov'
|
28
30
|
spec.add_development_dependency 'redis'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: organic-sitemap
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kaskito
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-11-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rack
|
@@ -30,6 +30,20 @@ dependencies:
|
|
30
30
|
- - ">="
|
31
31
|
- !ruby/object:Gem::Version
|
32
32
|
version: 1.0.0
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: httparty
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - ">="
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '0'
|
40
|
+
type: :runtime
|
41
|
+
prerelease: false
|
42
|
+
version_requirements: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0'
|
33
47
|
- !ruby/object:Gem::Dependency
|
34
48
|
name: bundler
|
35
49
|
requirement: !ruby/object:Gem::Requirement
|
@@ -48,14 +62,28 @@ dependencies:
|
|
48
62
|
name: rspec
|
49
63
|
requirement: !ruby/object:Gem::Requirement
|
50
64
|
requirements:
|
51
|
-
- - "
|
65
|
+
- - ">="
|
52
66
|
- !ruby/object:Gem::Version
|
53
67
|
version: '0'
|
54
68
|
type: :development
|
55
69
|
prerelease: false
|
56
70
|
version_requirements: !ruby/object:Gem::Requirement
|
57
71
|
requirements:
|
58
|
-
- - "
|
72
|
+
- - ">="
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: '0'
|
75
|
+
- !ruby/object:Gem::Dependency
|
76
|
+
name: rspec-core
|
77
|
+
requirement: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - ">="
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: '0'
|
82
|
+
type: :development
|
83
|
+
prerelease: false
|
84
|
+
version_requirements: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - ">="
|
59
87
|
- !ruby/object:Gem::Version
|
60
88
|
version: '0'
|
61
89
|
- !ruby/object:Gem::Dependency
|
@@ -150,13 +178,14 @@ files:
|
|
150
178
|
- lib/generators/organic_sitemap/config_generator.rb
|
151
179
|
- lib/generators/organic_sitemap/templates/organic_sitemap.rb
|
152
180
|
- lib/organic-sitemap.rb
|
181
|
+
- lib/organic-sitemap/cache_manager.rb
|
153
182
|
- lib/organic-sitemap/configuration.rb
|
183
|
+
- lib/organic-sitemap/crawler_manager.rb
|
154
184
|
- lib/organic-sitemap/middleware/url_capture.rb
|
155
185
|
- lib/organic-sitemap/railtie.rb
|
156
186
|
- lib/organic-sitemap/redis_manager.rb
|
157
187
|
- lib/organic-sitemap/url_processor.rb
|
158
188
|
- lib/organic-sitemap/version.rb
|
159
|
-
- organic-sitemap-0.1.1.gem
|
160
189
|
- organic_sitemap.gemspec
|
161
190
|
homepage: https://github.com/abelardogilm/organic-sitemap/
|
162
191
|
licenses:
|
data/organic-sitemap-0.1.1.gem
DELETED
Binary file
|