sitemap_gen 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/sitemap-gen +18 -8
- data/lib/sitemap_gen/csv.rb +4 -2
- data/lib/sitemap_gen/version.rb +1 -1
- data/lib/sitemap_gen/xml_crawler.rb +39 -0
- data/lib/sitemap_gen.rb +5 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 649e1930c46318acffd879e2fc23ae60f0af1515
|
4
|
+
data.tar.gz: e615ccc032fba5feeb67ae93153e9b4de01ab79d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4800975b66f0e8c44b12d68636e8b91ff9610307c1768184f575596312b4ded444f72ef2e3d608c2da265e790cd4057185ee8d1b225f3cb3a759e60d0bb5d09a
|
7
|
+
data.tar.gz: e5be1903dd5e6f31050d8f289a349955f2a9e9d9ed6ab49b750d34adfe5bdcdc350c8be802bf9598ebf82e7b4d082e81702f2abf3d994b37fc6c2f6b07833026
|
data/bin/sitemap-gen
CHANGED
@@ -14,20 +14,28 @@ Usage: sitemap-gen [OPTION] PATH
|
|
14
14
|
Options:
|
15
15
|
EOS
|
16
16
|
|
17
|
-
opts.on('-
|
18
|
-
options[:
|
17
|
+
opts.on('-c', '--checking_url', 'Check url wether or not valid url') do |path|
|
18
|
+
options[:checking_url] = true
|
19
19
|
end
|
20
20
|
|
21
|
-
opts.on('-
|
22
|
-
options[:
|
21
|
+
opts.on('-f', '--format [PATH]', 'Path to dir that need to be formatted') do |path|
|
22
|
+
options[:format] = path
|
23
|
+
end
|
24
|
+
|
25
|
+
opts.on('-i', '--input [PATH]', 'Input directory that need to generate csv') do |path|
|
26
|
+
options[:input] = path
|
23
27
|
end
|
24
28
|
|
25
29
|
opts.on('-o', '--output [PATH]', 'Path to save output csv') do |path|
|
26
30
|
options[:output] = path
|
27
31
|
end
|
28
32
|
|
29
|
-
opts.on('-
|
30
|
-
options[:
|
33
|
+
opts.on('-u', '--base_url [PATH]', 'Base url of website') do |path|
|
34
|
+
options[:base_url] = path
|
35
|
+
end
|
36
|
+
|
37
|
+
opts.on('-x', '--xml-path [PATH]', 'Path to xml file') do |path|
|
38
|
+
options[:xml_path] = path
|
31
39
|
end
|
32
40
|
|
33
41
|
opts.on('-h', '--help', 'Display information') do |help|
|
@@ -38,10 +46,12 @@ end.parse!
|
|
38
46
|
|
39
47
|
if options.key?(:input) && options.key?(:base_url)
|
40
48
|
if options.key?(:output)
|
41
|
-
SitemapGen.generate(options[:input], options[:base_url], options[:output])
|
49
|
+
SitemapGen.generate(options[:input], options[:base_url], options[:output], options[:checking_url])
|
42
50
|
exit
|
43
51
|
end
|
44
|
-
SitemapGen.generate(options[:input], options[:base_url])
|
52
|
+
SitemapGen.generate(options[:input], options[:base_url], nil, options[:checking_url])
|
45
53
|
elsif options.key?(:format)
|
46
54
|
SitemapGen.fix(options[:format])
|
55
|
+
elsif options.key?(:xml_path)
|
56
|
+
SitemapGen.crawl_xml(options[:xml_path], options[:output])
|
47
57
|
end
|
data/lib/sitemap_gen/csv.rb
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
module SitemapGen
|
2
2
|
class CSV
|
3
|
-
def initialize(dir_path, base_url, save_path)
|
3
|
+
def initialize(dir_path, base_url, save_path, checking_url)
|
4
4
|
@dir_path = dir_path
|
5
5
|
@base_url = base_url
|
6
6
|
@save_path = save_path || Dir.pwd
|
7
|
+
@checking_url = checking_url
|
7
8
|
@max_level = 1
|
8
9
|
@html_files = Dir.glob("#{dir_path}/**/index.html").sort_by { |f| File.dirname(f) }
|
9
10
|
raise 'There is no index.html files in your directory' if @html_files.empty?
|
@@ -26,7 +27,8 @@ module SitemapGen
|
|
26
27
|
next if f =~ ::SitemapGen::IGNORE_DIRS_REGEX
|
27
28
|
page_url = @base_url + server_path(f)
|
28
29
|
p page_url
|
29
|
-
sitemaps.push({ url: page_url, levels: dir_levels(f),
|
30
|
+
sitemaps.push({ url: page_url, levels: dir_levels(f),
|
31
|
+
status: checking_url ? page_status(page_url) : '' })
|
30
32
|
end
|
31
33
|
p 'Finish generating url'
|
32
34
|
sitemaps
|
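data/lib/sitemap_gen/version.rb
CHANGED
[hunk for version.rb (+1 -1) is missing from this extract]
data/lib/sitemap_gen/xml_crawler.rb
ADDED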
@@ -0,0 +1,39 @@
|
|
1
|
+
module Enumerable
|
2
|
+
def with_multithread(thread_num)
|
3
|
+
queue = Queue.new
|
4
|
+
threads = (1..thread_num).map do
|
5
|
+
Thread.new do
|
6
|
+
until queue.empty?
|
7
|
+
begin
|
8
|
+
yield(queue.pop)
|
9
|
+
rescue Exception
|
10
|
+
nil
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
each { |v| queue << v }
|
17
|
+
threads.each { |t| t.join }
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
module SitemapGen
|
22
|
+
class XMLCrawler
|
23
|
+
def self.execute(xml_path, save_path)
|
24
|
+
save_path ||= Dir.pwd
|
25
|
+
xml = File.open(xml_path) { |f| Nokogiri::XML(f) }
|
26
|
+
links = xml.css('loc').map(&:content)
|
27
|
+
::CSV.open("#{save_path}/sitemap_only_link_title.csv", 'wb') do |csv|
|
28
|
+
csv << ['ID', 'Page title', 'URL']
|
29
|
+
links.with_multithread(8) do |link|
|
30
|
+
p link
|
31
|
+
res = Net::HTTP.get_response(URI(link))
|
32
|
+
html = Nokogiri::HTML(res.body)
|
33
|
+
title = html.css('head title')&.first&.content
|
34
|
+
csv << ['', title, link]
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
data/lib/sitemap_gen.rb
CHANGED
@@ -8,6 +8,7 @@ module SitemapGen
|
|
8
8
|
|
9
9
|
autoload :CSV, 'sitemap_gen/csv'
|
10
10
|
autoload :Fixer, 'sitemap_gen/fixer'
|
11
|
+
autoload :XMLCrawler, 'sitemap_gen/xml_crawler'
|
11
12
|
|
12
13
|
def self.generate(dir_path, base_url, save_path = nil)
|
13
14
|
CSV.new(dir_path, base_url, save_path).execute
|
@@ -16,4 +17,8 @@ module SitemapGen
|
|
16
17
|
def self.fix(dir_path)
|
17
18
|
Fixer.new(dir_path).execute
|
18
19
|
end
|
20
|
+
|
21
|
+
def self.crawl_xml(xml_path, save_path)
|
22
|
+
XMLCrawler.execute(xml_path, save_path)
|
23
|
+
end
|
19
24
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sitemap_gen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.4
|
4
|
+
version: 0.2.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Minh Phan
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-07-
|
11
|
+
date: 2017-07-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -99,6 +99,7 @@ files:
|
|
99
99
|
- lib/sitemap_gen/csv.rb
|
100
100
|
- lib/sitemap_gen/fixer.rb
|
101
101
|
- lib/sitemap_gen/version.rb
|
102
|
+
- lib/sitemap_gen/xml_crawler.rb
|
102
103
|
homepage: https://github.com/1PACVietnam/sitemap-gen
|
103
104
|
licenses:
|
104
105
|
- MIT
|