sitemap_gen 0.2.4 → 0.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/sitemap-gen +18 -8
- data/lib/sitemap_gen/csv.rb +4 -2
- data/lib/sitemap_gen/version.rb +1 -1
- data/lib/sitemap_gen/xml_crawler.rb +39 -0
- data/lib/sitemap_gen.rb +5 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 649e1930c46318acffd879e2fc23ae60f0af1515
|
4
|
+
data.tar.gz: e615ccc032fba5feeb67ae93153e9b4de01ab79d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4800975b66f0e8c44b12d68636e8b91ff9610307c1768184f575596312b4ded444f72ef2e3d608c2da265e790cd4057185ee8d1b225f3cb3a759e60d0bb5d09a
|
7
|
+
data.tar.gz: e5be1903dd5e6f31050d8f289a349955f2a9e9d9ed6ab49b750d34adfe5bdcdc350c8be802bf9598ebf82e7b4d082e81702f2abf3d994b37fc6c2f6b07833026
|
data/bin/sitemap-gen
CHANGED
@@ -14,20 +14,28 @@ Usage: sitemap-gen [OPTION] PATH
|
|
14
14
|
Options:
|
15
15
|
EOS
|
16
16
|
|
17
|
-
opts.on('-
|
18
|
-
options[:
|
17
|
+
opts.on('-c', '--checking_url', 'Check url wether or not valid url') do |path|
|
18
|
+
options[:checking_url] = true
|
19
19
|
end
|
20
20
|
|
21
|
-
opts.on('-
|
22
|
-
options[:
|
21
|
+
opts.on('-f', '--format [PATH]', 'Path to dir that need to be formatted') do |path|
|
22
|
+
options[:format] = path
|
23
|
+
end
|
24
|
+
|
25
|
+
opts.on('-i', '--input [PATH]', 'Input directory that need to generate csv') do |path|
|
26
|
+
options[:input] = path
|
23
27
|
end
|
24
28
|
|
25
29
|
opts.on('-o', '--output [PATH]', 'Path to save output csv') do |path|
|
26
30
|
options[:output] = path
|
27
31
|
end
|
28
32
|
|
29
|
-
opts.on('-
|
30
|
-
options[:
|
33
|
+
opts.on('-u', '--base_url [PATH]', 'Base url of website') do |path|
|
34
|
+
options[:base_url] = path
|
35
|
+
end
|
36
|
+
|
37
|
+
opts.on('-x', '--xml-path [PATH]', 'Path to xml file') do |path|
|
38
|
+
options[:xml_path] = path
|
31
39
|
end
|
32
40
|
|
33
41
|
opts.on('-h', '--help', 'Display information') do |help|
|
@@ -38,10 +46,12 @@ end.parse!
|
|
38
46
|
|
39
47
|
if options.key?(:input) && options.key?(:base_url)
|
40
48
|
if options.key?(:output)
|
41
|
-
SitemapGen.generate(options[:input], options[:base_url], options[:output])
|
49
|
+
SitemapGen.generate(options[:input], options[:base_url], options[:output], options[:checking_url])
|
42
50
|
exit
|
43
51
|
end
|
44
|
-
SitemapGen.generate(options[:input], options[:base_url])
|
52
|
+
SitemapGen.generate(options[:input], options[:base_url], nil, options[:checking_url])
|
45
53
|
elsif options.key?(:format)
|
46
54
|
SitemapGen.fix(options[:format])
|
55
|
+
elsif options.key?(:xml_path)
|
56
|
+
SitemapGen.crawl_xml(options[:xml_path], options[:output])
|
47
57
|
end
|
data/lib/sitemap_gen/csv.rb
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
module SitemapGen
|
2
2
|
class CSV
|
3
|
-
def initialize(dir_path, base_url, save_path)
|
3
|
+
def initialize(dir_path, base_url, save_path, checking_url)
|
4
4
|
@dir_path = dir_path
|
5
5
|
@base_url = base_url
|
6
6
|
@save_path = save_path || Dir.pwd
|
7
|
+
@checking_url = checking_url
|
7
8
|
@max_level = 1
|
8
9
|
@html_files = Dir.glob("#{dir_path}/**/index.html").sort_by { |f| File.dirname(f) }
|
9
10
|
raise 'There is no index.html files in your directory' if @html_files.empty?
|
@@ -26,7 +27,8 @@ module SitemapGen
|
|
26
27
|
next if f =~ ::SitemapGen::IGNORE_DIRS_REGEX
|
27
28
|
page_url = @base_url + server_path(f)
|
28
29
|
p page_url
|
29
|
-
sitemaps.push({ url: page_url, levels: dir_levels(f),
|
30
|
+
sitemaps.push({ url: page_url, levels: dir_levels(f),
|
31
|
+
status: checking_url ? page_status(page_url) : '' })
|
30
32
|
end
|
31
33
|
p 'Finish generating url'
|
32
34
|
sitemaps
|
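data/lib/sitemap_gen/version.rb
CHANGED
(hunk missing from this capture; the summary above lists +1 -1, presumably the 0.2.4 → 0.2.5 version bump)
data/lib/sitemap_gen/xml_crawler.rb
ADDED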
@@ -0,0 +1,39 @@
|
|
1
|
+
module Enumerable
|
2
|
+
def with_multithread(thread_num)
|
3
|
+
queue = Queue.new
|
4
|
+
threads = (1..thread_num).map do
|
5
|
+
Thread.new do
|
6
|
+
until queue.empty?
|
7
|
+
begin
|
8
|
+
yield(queue.pop)
|
9
|
+
rescue Exception
|
10
|
+
nil
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
each { |v| queue << v }
|
17
|
+
threads.each { |t| t.join }
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
module SitemapGen
|
22
|
+
class XMLCrawler
|
23
|
+
def self.execute(xml_path, save_path)
|
24
|
+
save_path ||= Dir.pwd
|
25
|
+
xml = File.open(xml_path) { |f| Nokogiri::XML(f) }
|
26
|
+
links = xml.css('loc').map(&:content)
|
27
|
+
::CSV.open("#{save_path}/sitemap_only_link_title.csv", 'wb') do |csv|
|
28
|
+
csv << ['ID', 'Page title', 'URL']
|
29
|
+
links.with_multithread(8) do |link|
|
30
|
+
p link
|
31
|
+
res = Net::HTTP.get_response(URI(link))
|
32
|
+
html = Nokogiri::HTML(res.body)
|
33
|
+
title = html.css('head title')&.first&.content
|
34
|
+
csv << ['', title, link]
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
data/lib/sitemap_gen.rb
CHANGED
@@ -8,6 +8,7 @@ module SitemapGen
|
|
8
8
|
|
9
9
|
autoload :CSV, 'sitemap_gen/csv'
|
10
10
|
autoload :Fixer, 'sitemap_gen/fixer'
|
11
|
+
autoload :XMLCrawler, 'sitemap_gen/xml_crawler'
|
11
12
|
|
12
13
|
def self.generate(dir_path, base_url, save_path = nil)
|
13
14
|
CSV.new(dir_path, base_url, save_path).execute
|
@@ -16,4 +17,8 @@ module SitemapGen
|
|
16
17
|
def self.fix(dir_path)
|
17
18
|
Fixer.new(dir_path).execute
|
18
19
|
end
|
20
|
+
|
21
|
+
def self.crawl_xml(xml_path, save_path)
|
22
|
+
XMLCrawler.execute(xml_path, save_path)
|
23
|
+
end
|
19
24
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sitemap_gen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.4
|
4
|
+
version: 0.2.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Minh Phan
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-07-
|
11
|
+
date: 2017-07-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -99,6 +99,7 @@ files:
|
|
99
99
|
- lib/sitemap_gen/csv.rb
|
100
100
|
- lib/sitemap_gen/fixer.rb
|
101
101
|
- lib/sitemap_gen/version.rb
|
102
|
+
- lib/sitemap_gen/xml_crawler.rb
|
102
103
|
homepage: https://github.com/1PACVietnam/sitemap-gen
|
103
104
|
licenses:
|
104
105
|
- MIT
|