sitemap_gen 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/sitemap-gen +18 -8
- data/lib/sitemap_gen/csv.rb +4 -2
- data/lib/sitemap_gen/version.rb +1 -1
- data/lib/sitemap_gen/xml_crawler.rb +39 -0
- data/lib/sitemap_gen.rb +5 -0
- metadata +3 -2
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA1:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 649e1930c46318acffd879e2fc23ae60f0af1515
         | 
| 4 | 
            +
              data.tar.gz: e615ccc032fba5feeb67ae93153e9b4de01ab79d
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 4800975b66f0e8c44b12d68636e8b91ff9610307c1768184f575596312b4ded444f72ef2e3d608c2da265e790cd4057185ee8d1b225f3cb3a759e60d0bb5d09a
         | 
| 7 | 
            +
              data.tar.gz: e5be1903dd5e6f31050d8f289a349955f2a9e9d9ed6ab49b750d34adfe5bdcdc350c8be802bf9598ebf82e7b4d082e81702f2abf3d994b37fc6c2f6b07833026
         | 
    
        data/bin/sitemap-gen
    CHANGED
    
    | @@ -14,20 +14,28 @@ Usage: sitemap-gen [OPTION] PATH | |
| 14 14 | 
             
            Options:
         | 
| 15 15 | 
             
              EOS
         | 
| 16 16 |  | 
| 17 | 
            -
              opts.on('- | 
| 18 | 
            -
                options[: | 
| 17 | 
            +
              opts.on('-c', '--checking_url', 'Check url wether or not valid url') do |path|
         | 
| 18 | 
            +
                options[:checking_url] = true
         | 
| 19 19 | 
             
              end
         | 
| 20 20 |  | 
| 21 | 
            -
              opts.on('- | 
| 22 | 
            -
                options[: | 
| 21 | 
            +
              opts.on('-f', '--format [PATH]', 'Path to dir that need to be formatted') do |path|
         | 
| 22 | 
            +
                options[:format] = path
         | 
| 23 | 
            +
              end
         | 
| 24 | 
            +
             | 
| 25 | 
            +
              opts.on('-i', '--input [PATH]', 'Input directory that need to generate csv') do |path|
         | 
| 26 | 
            +
                options[:input] = path
         | 
| 23 27 | 
             
              end
         | 
| 24 28 |  | 
| 25 29 | 
             
              opts.on('-o', '--output [PATH]', 'Path to save output csv') do |path|
         | 
| 26 30 | 
             
                options[:output] = path
         | 
| 27 31 | 
             
              end
         | 
| 28 32 |  | 
| 29 | 
            -
              opts.on('- | 
| 30 | 
            -
                options[: | 
| 33 | 
            +
              opts.on('-u', '--base_url [PATH]', 'Base url of website') do |path|
         | 
| 34 | 
            +
                options[:base_url] = path
         | 
| 35 | 
            +
              end
         | 
| 36 | 
            +
             | 
| 37 | 
            +
              opts.on('-x', '--xml-path [PATH]', 'Path to xml file') do |path|
         | 
| 38 | 
            +
                options[:xml_path] = path
         | 
| 31 39 | 
             
              end
         | 
| 32 40 |  | 
| 33 41 | 
             
              opts.on('-h', '--help', 'Display information') do |help|
         | 
| @@ -38,10 +46,12 @@ end.parse! | |
| 38 46 |  | 
| 39 47 | 
             
            if options.key?(:input) && options.key?(:base_url)
         | 
| 40 48 | 
             
              if options.key?(:output)
         | 
| 41 | 
            -
                SitemapGen.generate(options[:input], options[:base_url], options[:output])
         | 
| 49 | 
            +
                SitemapGen.generate(options[:input], options[:base_url], options[:output], options[:checking_url])
         | 
| 42 50 | 
             
                exit
         | 
| 43 51 | 
             
              end
         | 
| 44 | 
            -
              SitemapGen.generate(options[:input], options[:base_url])
         | 
| 52 | 
            +
              SitemapGen.generate(options[:input], options[:base_url], nil, options[:checking_url])
         | 
| 45 53 | 
             
            elsif options.key?(:format)
         | 
| 46 54 | 
             
              SitemapGen.fix(options[:format])
         | 
| 55 | 
            +
            elsif options.key?(:xml_path)
         | 
| 56 | 
            +
              SitemapGen.crawl_xml(options[:xml_path], options[:output])
         | 
| 47 57 | 
             
            end
         | 
    
        data/lib/sitemap_gen/csv.rb
    CHANGED
    
    | @@ -1,9 +1,10 @@ | |
| 1 1 | 
             
            module SitemapGen
         | 
| 2 2 | 
             
              class CSV
         | 
| 3 | 
            -
                def initialize(dir_path, base_url, save_path)
         | 
| 3 | 
            +
                def initialize(dir_path, base_url, save_path, checking_url)
         | 
| 4 4 | 
             
                  @dir_path = dir_path
         | 
| 5 5 | 
             
                  @base_url = base_url
         | 
| 6 6 | 
             
                  @save_path = save_path || Dir.pwd
         | 
| 7 | 
            +
                  @checking_url = checking_url
         | 
| 7 8 | 
             
                  @max_level = 1
         | 
| 8 9 | 
             
                  @html_files = Dir.glob("#{dir_path}/**/index.html").sort_by { |f| File.dirname(f) }
         | 
| 9 10 | 
             
                  raise 'There is no index.html files in your directory' if @html_files.empty?
         | 
| @@ -26,7 +27,8 @@ module SitemapGen | |
| 26 27 | 
             
                      next if f =~ ::SitemapGen::IGNORE_DIRS_REGEX
         | 
| 27 28 | 
             
                      page_url = @base_url + server_path(f)
         | 
| 28 29 | 
             
                      p page_url
         | 
| 29 | 
            -
                      sitemaps.push({ url: page_url, levels: dir_levels(f), | 
| 30 | 
            +
                      sitemaps.push({ url: page_url, levels: dir_levels(f),
         | 
| 31 | 
            +
                                      status: checking_url ? page_status(page_url) : '' })
         | 
| 30 32 | 
             
                    end
         | 
| 31 33 | 
             
                    p 'Finish generating url'
         | 
| 32 34 | 
             
                    sitemaps
         | 
    
        data/lib/sitemap_gen/version.rb
    CHANGED
    
    
        data/lib/sitemap_gen/xml_crawler.rb
    ADDED
    
| @@ -0,0 +1,39 @@ | |
| 1 | 
            +
            module Enumerable
         | 
| 2 | 
            +
              def with_multithread(thread_num)
         | 
| 3 | 
            +
                queue = Queue.new
         | 
| 4 | 
            +
                threads = (1..thread_num).map do
         | 
| 5 | 
            +
                  Thread.new do
         | 
| 6 | 
            +
                    until queue.empty?
         | 
| 7 | 
            +
                      begin
         | 
| 8 | 
            +
                        yield(queue.pop)
         | 
| 9 | 
            +
                      rescue Exception
         | 
| 10 | 
            +
                        nil
         | 
| 11 | 
            +
                      end
         | 
| 12 | 
            +
                    end
         | 
| 13 | 
            +
                  end
         | 
| 14 | 
            +
                end
         | 
| 15 | 
            +
             | 
| 16 | 
            +
                each { |v| queue << v }
         | 
| 17 | 
            +
                threads.each { |t| t.join }
         | 
| 18 | 
            +
              end
         | 
| 19 | 
            +
            end
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            module SitemapGen
         | 
| 22 | 
            +
              class XMLCrawler
         | 
| 23 | 
            +
                def self.execute(xml_path, save_path)
         | 
| 24 | 
            +
                  save_path ||= Dir.pwd
         | 
| 25 | 
            +
                  xml = File.open(xml_path) { |f| Nokogiri::XML(f) }
         | 
| 26 | 
            +
                  links = xml.css('loc').map(&:content)
         | 
| 27 | 
            +
                  ::CSV.open("#{save_path}/sitemap_only_link_title.csv", 'wb') do |csv|
         | 
| 28 | 
            +
                    csv << ['ID', 'Page title', 'URL']
         | 
| 29 | 
            +
                    links.with_multithread(8) do |link|
         | 
| 30 | 
            +
                      p link
         | 
| 31 | 
            +
                      res = Net::HTTP.get_response(URI(link))
         | 
| 32 | 
            +
                      html = Nokogiri::HTML(res.body)
         | 
| 33 | 
            +
                      title = html.css('head title')&.first&.content
         | 
| 34 | 
            +
                      csv << ['', title, link]
         | 
| 35 | 
            +
                    end
         | 
| 36 | 
            +
                  end
         | 
| 37 | 
            +
                end
         | 
| 38 | 
            +
              end
         | 
| 39 | 
            +
            end
         | 
    
        data/lib/sitemap_gen.rb
    CHANGED
    
    | @@ -8,6 +8,7 @@ module SitemapGen | |
| 8 8 |  | 
| 9 9 | 
             
              autoload :CSV, 'sitemap_gen/csv'
         | 
| 10 10 | 
             
              autoload :Fixer, 'sitemap_gen/fixer'
         | 
| 11 | 
            +
              autoload :XMLCrawler, 'sitemap_gen/xml_crawler'
         | 
| 11 12 |  | 
| 12 13 | 
             
              def self.generate(dir_path, base_url, save_path = nil)
         | 
| 13 14 | 
             
                CSV.new(dir_path, base_url, save_path).execute
         | 
| @@ -16,4 +17,8 @@ module SitemapGen | |
| 16 17 | 
             
              def self.fix(dir_path)
         | 
| 17 18 | 
             
                Fixer.new(dir_path).execute
         | 
| 18 19 | 
             
              end
         | 
| 20 | 
            +
             | 
| 21 | 
            +
              def self.crawl_xml(xml_path, save_path)
         | 
| 22 | 
            +
                XMLCrawler.execute(xml_path, save_path)
         | 
| 23 | 
            +
              end
         | 
| 19 24 | 
             
            end
         | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: sitemap_gen
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.2. | 
| 4 | 
            +
              version: 0.2.5
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Minh Phan
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2017-07- | 
| 11 | 
            +
            date: 2017-07-26 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: nokogiri
         | 
| @@ -99,6 +99,7 @@ files: | |
| 99 99 | 
             
            - lib/sitemap_gen/csv.rb
         | 
| 100 100 | 
             
            - lib/sitemap_gen/fixer.rb
         | 
| 101 101 | 
             
            - lib/sitemap_gen/version.rb
         | 
| 102 | 
            +
            - lib/sitemap_gen/xml_crawler.rb
         | 
| 102 103 | 
             
            homepage: https://github.com/1PACVietnam/sitemap-gen
         | 
| 103 104 | 
             
            licenses:
         | 
| 104 105 | 
             
            - MIT
         |