sitemap_gen 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 81da27f56ebc54a651b617b993b7ff9168b87d77
4
- data.tar.gz: 6cd803976a7b020e36c04344b0cf6c91964b40b4
3
+ metadata.gz: 649e1930c46318acffd879e2fc23ae60f0af1515
4
+ data.tar.gz: e615ccc032fba5feeb67ae93153e9b4de01ab79d
5
5
  SHA512:
6
- metadata.gz: 9d6bf23cbc64ab4184c56bf9473d24ffac891234e38250b17d646de9734238582df7135683b9c6a255084598740372fb1f221fa475284f0ae4412fe886fb71f1
7
- data.tar.gz: 4c12cd59f0cb871466ead3b7727cba61ed1a820b848d9263325bad2c128f7e18cdf1dd2ad65330acafc1504653eab15f871d4debcbf1d17bce4f5795858fef47
6
+ metadata.gz: 4800975b66f0e8c44b12d68636e8b91ff9610307c1768184f575596312b4ded444f72ef2e3d608c2da265e790cd4057185ee8d1b225f3cb3a759e60d0bb5d09a
7
+ data.tar.gz: e5be1903dd5e6f31050d8f289a349955f2a9e9d9ed6ab49b750d34adfe5bdcdc350c8be802bf9598ebf82e7b4d082e81702f2abf3d994b37fc6c2f6b07833026
data/bin/sitemap-gen CHANGED
@@ -14,20 +14,28 @@ Usage: sitemap-gen [OPTION] PATH
14
14
  Options:
15
15
  EOS
16
16
 
17
- opts.on('-i', '--input [PATH]', 'Input directory that need to generate csv') do |path|
18
- options[:input] = path
17
+ opts.on('-c', '--checking_url', 'Check url wether or not valid url') do |path|
18
+ options[:checking_url] = true
19
19
  end
20
20
 
21
- opts.on('-u', '--base_url [PATH]', 'Base url of website') do |path|
22
- options[:base_url] = path
21
+ opts.on('-f', '--format [PATH]', 'Path to dir that need to be formatted') do |path|
22
+ options[:format] = path
23
+ end
24
+
25
+ opts.on('-i', '--input [PATH]', 'Input directory that need to generate csv') do |path|
26
+ options[:input] = path
23
27
  end
24
28
 
25
29
  opts.on('-o', '--output [PATH]', 'Path to save output csv') do |path|
26
30
  options[:output] = path
27
31
  end
28
32
 
29
- opts.on('-f', '--format [PATH]', 'Path to dir that need to be formatted') do |path|
30
- options[:format] = path
33
+ opts.on('-u', '--base_url [PATH]', 'Base url of website') do |path|
34
+ options[:base_url] = path
35
+ end
36
+
37
+ opts.on('-x', '--xml-path [PATH]', 'Path to xml file') do |path|
38
+ options[:xml_path] = path
31
39
  end
32
40
 
33
41
  opts.on('-h', '--help', 'Display information') do |help|
@@ -38,10 +46,12 @@ end.parse!
38
46
 
39
47
  if options.key?(:input) && options.key?(:base_url)
40
48
  if options.key?(:output)
41
- SitemapGen.generate(options[:input], options[:base_url], options[:output])
49
+ SitemapGen.generate(options[:input], options[:base_url], options[:output], options[:checking_url])
42
50
  exit
43
51
  end
44
- SitemapGen.generate(options[:input], options[:base_url])
52
+ SitemapGen.generate(options[:input], options[:base_url], nil, options[:checking_url])
45
53
  elsif options.key?(:format)
46
54
  SitemapGen.fix(options[:format])
55
+ elsif options.key?(:xml_path)
56
+ SitemapGen.crawl_xml(options[:xml_path], options[:output])
47
57
  end
data/lib/sitemap_gen/csv.rb CHANGED
@@ -1,9 +1,10 @@
1
1
  module SitemapGen
2
2
  class CSV
3
- def initialize(dir_path, base_url, save_path)
3
+ def initialize(dir_path, base_url, save_path, checking_url)
4
4
  @dir_path = dir_path
5
5
  @base_url = base_url
6
6
  @save_path = save_path || Dir.pwd
7
+ @checking_url = checking_url
7
8
  @max_level = 1
8
9
  @html_files = Dir.glob("#{dir_path}/**/index.html").sort_by { |f| File.dirname(f) }
9
10
  raise 'There is no index.html files in your directory' if @html_files.empty?
@@ -26,7 +27,8 @@ module SitemapGen
26
27
  next if f =~ ::SitemapGen::IGNORE_DIRS_REGEX
27
28
  page_url = @base_url + server_path(f)
28
29
  p page_url
29
- sitemaps.push({ url: page_url, levels: dir_levels(f), status: page_status(page_url) })
30
+ sitemaps.push({ url: page_url, levels: dir_levels(f),
31
+ status: checking_url ? page_status(page_url) : '' })
30
32
  end
31
33
  p 'Finish generating url'
32
34
  sitemaps
data/lib/sitemap_gen/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module SitemapGen
2
- VERSION = '0.2.4'
2
+ VERSION = '0.2.5'
3
3
  end
data/lib/sitemap_gen/xml_crawler.rb ADDED
@@ -0,0 +1,39 @@
1
+ module Enumerable
2
+ def with_multithread(thread_num)
3
+ queue = Queue.new
4
+ threads = (1..thread_num).map do
5
+ Thread.new do
6
+ until queue.empty?
7
+ begin
8
+ yield(queue.pop)
9
+ rescue Exception
10
+ nil
11
+ end
12
+ end
13
+ end
14
+ end
15
+
16
+ each { |v| queue << v }
17
+ threads.each { |t| t.join }
18
+ end
19
+ end
20
+
21
+ module SitemapGen
22
+ class XMLCrawler
23
+ def self.execute(xml_path, save_path)
24
+ save_path ||= Dir.pwd
25
+ xml = File.open(xml_path) { |f| Nokogiri::XML(f) }
26
+ links = xml.css('loc').map(&:content)
27
+ ::CSV.open("#{save_path}/sitemap_only_link_title.csv", 'wb') do |csv|
28
+ csv << ['ID', 'Page title', 'URL']
29
+ links.with_multithread(8) do |link|
30
+ p link
31
+ res = Net::HTTP.get_response(URI(link))
32
+ html = Nokogiri::HTML(res.body)
33
+ title = html.css('head title')&.first&.content
34
+ csv << ['', title, link]
35
+ end
36
+ end
37
+ end
38
+ end
39
+ end
data/lib/sitemap_gen.rb CHANGED
@@ -8,6 +8,7 @@ module SitemapGen
8
8
 
9
9
  autoload :CSV, 'sitemap_gen/csv'
10
10
  autoload :Fixer, 'sitemap_gen/fixer'
11
+ autoload :XMLCrawler, 'sitemap_gen/xml_crawler'
11
12
 
12
13
  def self.generate(dir_path, base_url, save_path = nil)
13
14
  CSV.new(dir_path, base_url, save_path).execute
@@ -16,4 +17,8 @@ module SitemapGen
16
17
  def self.fix(dir_path)
17
18
  Fixer.new(dir_path).execute
18
19
  end
20
+
21
+ def self.crawl_xml(xml_path, save_path)
22
+ XMLCrawler.execute(xml_path, save_path)
23
+ end
19
24
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sitemap_gen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Minh Phan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-07-18 00:00:00.000000000 Z
11
+ date: 2017-07-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -99,6 +99,7 @@ files:
99
99
  - lib/sitemap_gen/csv.rb
100
100
  - lib/sitemap_gen/fixer.rb
101
101
  - lib/sitemap_gen/version.rb
102
+ - lib/sitemap_gen/xml_crawler.rb
102
103
  homepage: https://github.com/1PACVietnam/sitemap-gen
103
104
  licenses:
104
105
  - MIT