RubyGems - sitetap - Versions diffs - 0.1.1 → 0.1.2 - Mend

sitetap 0.1.1 → 0.1.2

Files changed (6) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 89d0090637f8b89550f043a3fbfc6d7094c4bfa9
-  data.tar.gz: f58e37144bb456e114d2c44957a11969ed47366e
+  metadata.gz: 54e85a540d37ca649e921379f6c45b4142713339
+  data.tar.gz: 4f1e3f07367b37a1fb21ff14ecb0780f4f4e962f
 SHA512:
-  metadata.gz: f0b5a22231d7239bdff707c26345654d5a60ce97c9fc63922abfb954f531532a8b58edb3a7451f6b661167a404a40efde8f53a59f40f61640a341a1b701a5384
-  data.tar.gz: 65843f3c0c823ee2ffbeff2f95d70b7f781899565961d7c0b00c5a0b03504cfd3162ad11674ceb188c64415c2049eb948341d757448b113ec04de91621f04700
+  metadata.gz: f2cc1b00ec4b37fc26facbf3c27c59595c6fdc5de8502424186fc320d985f550c28f986da30fbddce40c6891795ed51cfc56edade094764402de93f2fd743ee8
+  data.tar.gz: 0294b1b4d7dc0e458b3a06fac61ef5623d74b395c55a7e2c99209c765113987f193d662712561d47ea4729b76d887e1bc2d10c239299456a1783c477d02c146b

data/bin/sitetap CHANGED Viewed

@@ -3,12 +3,13 @@
 require 'sitetap/scraper'
 require 'sitetap/parser'
-url = ARGV[0]
+url       = ARGV[0]
+selector  = ARGV[1]
 if url.nil? || url == ''
   puts "Usage: sitetap [URL]"
   exit
 else
   scraper = Sitetap::Scraper.scrape!(url)
-  parser = Sitetap::Parser.parse!(scraper.dir)
+  parser = Sitetap::Parser.parse!(scraper.dir, selector)
 end

data/lib/sitetap/parser.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 require 'nokogiri'
 require 'reverse_markdown'
 require 'fileutils'
+require 'sanitize'
 module Sitetap
   class Parser
@@ -9,12 +10,13 @@ module Sitetap
       @root = root_dir
     end
-    def self.parse!(root_dir)
-      parser = Sitetap::Parser.new(root_dir).parse!
+    def self.parse!(root_dir, selector = nil)
+      parser = Sitetap::Parser.new(root_dir).parse!(selector)
       parser
     end
-    def parse!
+    def parse!(selector = nil)
+      @selector = selector unless selector.nil?
       verify_directories
       do_the_loop
       self
@@ -78,24 +80,24 @@ module Sitetap
         # get the path of the file relative to the html
         # directory (scraped dir)
-        #
+        #
         file_path = file.gsub(/#{html_dir}\//, '')
         # clean the contents of the html file so we can work
         # with it
-        #
+        #
         contents = clean_html(file)
         # set the references to where the new files will
         # live
-        #
+        #
         tmp_file_path       = "#{tmp_dir}/#{file_path}"
         markdown_file_path  = "#{md_dir}/#{file_path}.md"
         text_file_path      = "#{txt_dir}/#{file_path}.txt"
         # find or create directories that will contain the
         # file
-        #
+        #
         verify_file_directories([
           tmp_file_path,
           markdown_file_path,
@@ -104,22 +106,22 @@ module Sitetap
         # write a temporary html file with the cleaned-up
         # contents
-        #
+        #
         write_file(tmp_file_path, contents)
         # now we hone in on the html contents and strip the
         # stuff we don't need
-        #
+        #
         adj_contents = filter_html(tmp_file_path)
         # convert the adjusted html to markdown and write it
         # to file
-        #
+        #
         write_file(markdown_file_path, html2markdown(adj_contents))
         # last, we remove all the tags and write the plain
         # text file
-        #
+        #
         write_file(text_file_path, strip_tags(adj_contents))
       end
@@ -145,23 +147,22 @@ module Sitetap
     end
     def filter_html(file_path)
-      contents = File.read(file_path, :encoding => 'ASCII')
+      contents = File.read(file_path, :encoding => 'UTF-8')
       page = Nokogiri::HTML(contents)
       content = page.css(selector).to_s
-      # content = page.css('body').to_s if content == ''
     end
     def strip_tags(html)
-      html = html.gsub(/(<[^>]*>)|\n|\t/s, ' ')
-      html.gsub(/(\ \ )+/, "\n\n")
+      html = Sanitize.fragment(html)
+      html.gsub(/\n(\ )+/, "\n").gsub(/\ \ +/, "\n\n").gsub(/\n\n\n+/, "\n\n")
     end
     def html2markdown(html)
       ReverseMarkdown.convert(
-        html,
-        :unknown_tags => :bypass,
+        html,
+        :unknown_tags => :bypass,
         :github_flavored => true
-      )
+      ).gsub(/\n(\ )+/, "\n").gsub(/\n\n\n+/, "\n\n")
     end
     # ------------------------------------ Writing Files

data/lib/sitetap/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Sitetap
-  VERSION = "0.1.1"
+  VERSION = "0.1.2"
 end

data/sitetap.gemspec CHANGED Viewed

@@ -22,4 +22,5 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "rake", "~> 10.0"
   spec.add_runtime_dependency "nokogiri"
   spec.add_runtime_dependency "reverse_markdown"
+  spec.add_runtime_dependency "sanitize"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: sitetap
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - Sean C Davis
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-03-17 00:00:00.000000000 Z
+date: 2015-11-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -66,6 +66,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: sanitize
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: ''
 email:
 - scdavis41@gmail.com
@@ -105,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.2.0
+rubygems_version: 2.4.6
 signing_key:
 specification_version: 4
 summary: Scrape content from a website.