sitemap_gen 0.2.4 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 81da27f56ebc54a651b617b993b7ff9168b87d77
4
- data.tar.gz: 6cd803976a7b020e36c04344b0cf6c91964b40b4
3
+ metadata.gz: 649e1930c46318acffd879e2fc23ae60f0af1515
4
+ data.tar.gz: e615ccc032fba5feeb67ae93153e9b4de01ab79d
5
5
  SHA512:
6
- metadata.gz: 9d6bf23cbc64ab4184c56bf9473d24ffac891234e38250b17d646de9734238582df7135683b9c6a255084598740372fb1f221fa475284f0ae4412fe886fb71f1
7
- data.tar.gz: 4c12cd59f0cb871466ead3b7727cba61ed1a820b848d9263325bad2c128f7e18cdf1dd2ad65330acafc1504653eab15f871d4debcbf1d17bce4f5795858fef47
6
+ metadata.gz: 4800975b66f0e8c44b12d68636e8b91ff9610307c1768184f575596312b4ded444f72ef2e3d608c2da265e790cd4057185ee8d1b225f3cb3a759e60d0bb5d09a
7
+ data.tar.gz: e5be1903dd5e6f31050d8f289a349955f2a9e9d9ed6ab49b750d34adfe5bdcdc350c8be802bf9598ebf82e7b4d082e81702f2abf3d994b37fc6c2f6b07833026
data/bin/sitemap-gen CHANGED
@@ -14,20 +14,28 @@ Usage: sitemap-gen [OPTION] PATH
14
14
  Options:
15
15
  EOS
16
16
 
17
- opts.on('-i', '--input [PATH]', 'Input directory that need to generate csv') do |path|
18
- options[:input] = path
17
+ opts.on('-c', '--checking_url', 'Check url wether or not valid url') do |path|
18
+ options[:checking_url] = true
19
19
  end
20
20
 
21
- opts.on('-u', '--base_url [PATH]', 'Base url of website') do |path|
22
- options[:base_url] = path
21
+ opts.on('-f', '--format [PATH]', 'Path to dir that need to be formatted') do |path|
22
+ options[:format] = path
23
+ end
24
+
25
+ opts.on('-i', '--input [PATH]', 'Input directory that need to generate csv') do |path|
26
+ options[:input] = path
23
27
  end
24
28
 
25
29
  opts.on('-o', '--output [PATH]', 'Path to save output csv') do |path|
26
30
  options[:output] = path
27
31
  end
28
32
 
29
- opts.on('-f', '--format [PATH]', 'Path to dir that need to be formatted') do |path|
30
- options[:format] = path
33
+ opts.on('-u', '--base_url [PATH]', 'Base url of website') do |path|
34
+ options[:base_url] = path
35
+ end
36
+
37
+ opts.on('-x', '--xml-path [PATH]', 'Path to xml file') do |path|
38
+ options[:xml_path] = path
31
39
  end
32
40
 
33
41
  opts.on('-h', '--help', 'Display information') do |help|
@@ -38,10 +46,12 @@ end.parse!
38
46
 
39
47
  if options.key?(:input) && options.key?(:base_url)
40
48
  if options.key?(:output)
41
- SitemapGen.generate(options[:input], options[:base_url], options[:output])
49
+ SitemapGen.generate(options[:input], options[:base_url], options[:output], options[:checking_url])
42
50
  exit
43
51
  end
44
- SitemapGen.generate(options[:input], options[:base_url])
52
+ SitemapGen.generate(options[:input], options[:base_url], nil, options[:checking_url])
45
53
  elsif options.key?(:format)
46
54
  SitemapGen.fix(options[:format])
55
+ elsif options.key?(:xml_path)
56
+ SitemapGen.crawl_xml(options[:xml_path], options[:output])
47
57
  end
@@ -1,9 +1,10 @@
1
1
  module SitemapGen
2
2
  class CSV
3
- def initialize(dir_path, base_url, save_path)
3
+ def initialize(dir_path, base_url, save_path, checking_url)
4
4
  @dir_path = dir_path
5
5
  @base_url = base_url
6
6
  @save_path = save_path || Dir.pwd
7
+ @checking_url = checking_url
7
8
  @max_level = 1
8
9
  @html_files = Dir.glob("#{dir_path}/**/index.html").sort_by { |f| File.dirname(f) }
9
10
  raise 'There is no index.html files in your directory' if @html_files.empty?
@@ -26,7 +27,8 @@ module SitemapGen
26
27
  next if f =~ ::SitemapGen::IGNORE_DIRS_REGEX
27
28
  page_url = @base_url + server_path(f)
28
29
  p page_url
29
- sitemaps.push({ url: page_url, levels: dir_levels(f), status: page_status(page_url) })
30
+ sitemaps.push({ url: page_url, levels: dir_levels(f),
31
+ status: checking_url ? page_status(page_url) : '' })
30
32
  end
31
33
  p 'Finish generating url'
32
34
  sitemaps
@@ -1,3 +1,3 @@
1
1
  module SitemapGen
2
- VERSION = '0.2.4'
2
+ VERSION = '0.2.5'
3
3
  end
@@ -0,0 +1,39 @@
1
module Enumerable
  # Runs the given block once per element of the receiver, spreading the
  # calls over +thread_num+ worker threads.
  #
  # Every element is queued before any worker is started, and workers take
  # items with a non-blocking pop. A worker therefore cannot quit early
  # because the queue has not been filled yet, and cannot block forever on a
  # queue that another worker has just drained.
  #
  # Errors raised by the block are handled best-effort: they are reported on
  # stderr and the worker carries on with the next element. Only
  # StandardError is rescued, so SystemExit, NoMemoryError and similar are
  # not silently swallowed inside a worker.
  #
  # @param thread_num [Integer] number of worker threads to start
  # @yield [item] an element of the receiver, called from a worker thread
  # @return [Array<Thread>] the worker threads, all already joined
  def with_multithread(thread_num)
    queue = Queue.new
    each { |v| queue << v }

    threads = (1..thread_num).map do
      Thread.new do
        loop do
          begin
            # Non-blocking pop raises ThreadError once the queue is empty.
            item = queue.pop(true)
          rescue ThreadError
            break
          end

          begin
            yield(item)
          rescue StandardError => e
            warn "with_multithread: #{e.class}: #{e.message}"
          end
        end
      end
    end

    threads.each(&:join)
  end
end
20
+
21
module SitemapGen
  # Fetches every URL listed in a sitemap XML file and records each page's
  # title in a CSV file.
  class XMLCrawler
    # Reads the content of every +loc+ element in the XML file, requests each
    # link over HTTP using 8 worker threads, and writes one row per fetched
    # page to "sitemap_only_link_title.csv" inside +save_path+.
    #
    # The ID column is left empty. Rows are appended by whichever worker
    # finishes, so they are not guaranteed to follow the order of the XML.
    #
    # @param xml_path [String] path to the sitemap XML file
    # @param save_path [String, nil] directory for the CSV; defaults to the
    #   current working directory when nil
    def self.execute(xml_path, save_path)
      save_path ||= Dir.pwd
      xml = File.open(xml_path) { |f| Nokogiri::XML(f) }
      links = xml.css('loc').map(&:content)
      # The CSV handle is shared by all workers; serialize row writes so
      # concurrent appends cannot interleave.
      csv_lock = Mutex.new
      ::CSV.open("#{save_path}/sitemap_only_link_title.csv", 'wb') do |csv|
        csv << ['ID', 'Page title', 'URL']
        links.with_multithread(8) do |link|
          p link
          res = Net::HTTP.get_response(URI(link))
          html = Nokogiri::HTML(res.body)
          # A page without a <title> yields a nil title (empty CSV cell).
          title = html.css('head title').first&.content
          csv_lock.synchronize { csv << ['', title, link] }
        end
      end
    end
  end
end
data/lib/sitemap_gen.rb CHANGED
@@ -8,6 +8,7 @@ module SitemapGen
8
8
 
9
9
  autoload :CSV, 'sitemap_gen/csv'
10
10
  autoload :Fixer, 'sitemap_gen/fixer'
11
+ autoload :XMLCrawler, 'sitemap_gen/xml_crawler'
11
12
 
12
13
  def self.generate(dir_path, base_url, save_path = nil)
13
14
  CSV.new(dir_path, base_url, save_path).execute
@@ -16,4 +17,8 @@ module SitemapGen
16
17
  def self.fix(dir_path)
17
18
  Fixer.new(dir_path).execute
18
19
  end
20
+
21
# Crawls the links listed in a sitemap XML file by delegating to
# XMLCrawler.execute.
#
# @param xml_path [String] path to the sitemap XML file
# @param save_path [String, nil] output directory; optional, passed through
#   to XMLCrawler.execute unchanged
def self.crawl_xml(xml_path, save_path = nil)
  XMLCrawler.execute(xml_path, save_path)
end
19
24
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sitemap_gen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Minh Phan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-07-18 00:00:00.000000000 Z
11
+ date: 2017-07-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -99,6 +99,7 @@ files:
99
99
  - lib/sitemap_gen/csv.rb
100
100
  - lib/sitemap_gen/fixer.rb
101
101
  - lib/sitemap_gen/version.rb
102
+ - lib/sitemap_gen/xml_crawler.rb
102
103
  homepage: https://github.com/1PACVietnam/sitemap-gen
103
104
  licenses:
104
105
  - MIT