RubyGems - sitetap - Versions diffs - 0.0.0 → 0.1.0 - Mend

sitetap 0.0.0 → 0.1.0

Files changed (8) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 54aa226490e4ea05843234b47534780ce0514413
-  data.tar.gz: 6a2354d47a8fa7ebc1a4838e8ac3b394f6556abd
+  metadata.gz: 910630c447b621e63f65047fb027b385913d0a21
+  data.tar.gz: 6f2689ccdbfd4897d8e9ac8adce44452be392423
 SHA512:
-  metadata.gz: 01ce3d3194cf12fb9c66d30e3c1a3010bf80e41bd6f68725e6d98cce87bef32277d15a3b2e43c53a0f69c9b56393a4aab85ff9d74a28a6b3e25da89ec693e939
-  data.tar.gz: 07dd9f5e9e0653f06465bedbccf80242d442aac615c47746fd11103902ba0f206daf38f14257d1f8d5882b92deaab8d3e70b7e6e82c2cc42bdaef1f66cd8f017
+  metadata.gz: 5201c5d44db48541d76ca615a309968b0bd3956e9a50fb1d9eea2ea1f0e8674fbda53bb95f2dd8251a327451348930bdb86592e398d2ea032509a8caef96f267
+  data.tar.gz: 06a8d55f597e7c8289af49469d01b45f279cf6d271c3c29eba4cf38a72df0b791a8208e441ae26832ae6bc689608e2cb786d7f76442733bbedf09c8f0186d484

data/README.md CHANGED Viewed

@@ -1,10 +1,16 @@
-# Sitetap
+SiteTap
+==========
-TODO: Write a gem description
+SiteTap takes a home page URL and turns it into a packaged directory of:
-## Installation
+* html
+* plain text
+* markdown
-Add this line to your application's Gemfile:
+Installation
+----------
+To use this in a Ruby project, add the following to your `Gemfile`:
 ```ruby
 gem 'sitetap'
@@ -12,17 +18,56 @@ gem 'sitetap'
 And then execute:
-    $ bundle
+```text
+$ bundle install
+```
+Or install it so you can run it globally:
+```text
+$ gem install sitetap
+```
+Usage
+----------
+Using SiteTap is quite simple. You just run the executable and give it a URL.
+```text
+$ sitetap [URL]
+```
+So, if I wanted to scrape Sapwood's website, I could do this:
+```text
+$ sitetap "http://sapwood.org/"
+```
+Within your current directory, this will create the following directory
+structure:
+```text
+- sapwood.org
+    - html
+    - markdown
+    - text
+    - tmp
+```
+Within each are the converted files from the website.
-Or install it yourself as:
+Bugs
+----------
-    $ gem install sitetap
+Please [create an issue](https://github.com/seancdavis/sitetap/issues/new) if
+you encounter a bug.
-## Usage
+Contributing
+----------
-TODO: Write usage instructions here
+Missing a feature? Add it!
-## Contributing
+Found a bug? Fix it!
 1. Fork it ( https://github.com/[my-github-username]/sitetap/fork )
 2. Create your feature branch (`git checkout -b my-new-feature`)

data/bin/sitetap ADDED Viewed

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+require 'sitetap/scraper'
+require 'sitetap/parser'
+url = ARGV[0]
+if url.nil? || url == ''
+  puts "Usage: sitetap [URL]"
+  exit
+else
+  scraper = Sitetap::Scraper.scrape!(url)
+  parser = Sitetap::Parser.parse!(scraper.dir)
+end

data/lib/sitetap/parser.rb ADDED Viewed

@@ -0,0 +1,174 @@
+require 'nokogiri'
+require 'reverse_markdown'
+require 'fileutils'
+module Sitetap
+  class Parser
+    def initialize(root_dir)
+      @root = root_dir
+    end
+    def self.parse!(root_dir)
+      parser = Sitetap::Parser.new(root_dir).parse!
+      parser
+    end
+    def parse!
+      verify_directories
+      do_the_loop
+      self
+    end
+    private
+    # ------------------------------------ References
+    def root
+      @root
+    end
+    def html_dir
+      @html_dir ||= "#{root}/html"
+    end
+    def tmp_dir
+      @tmp_dir ||= "#{root}/tmp"
+    end
+    def md_dir
+      @md_dir ||= "#{root}/markdown"
+    end
+    def txt_dir
+      @txt_dir ||= "#{root}/text"
+    end
+    def selector
+      @selector ||= "body"
+    end
+    def files
+      @files ||= Dir.glob("#{html_dir}/**/*.html")
+    end
+    # ------------------------------------ Directories
+    def mkdir_p(dir)
+      unless Dir.exists?(dir)
+        FileUtils.mkdir_p(dir)
+      end
+    end
+    def verify_directories
+      [tmp_dir, md_dir, txt_dir].each { |dir| mkdir_p(dir) }
+    end
+    def verify_file_directories(files)
+      files.each do |file|
+        dir = file.split('/')[0..-2].join('/')
+        mkdir_p(dir)
+      end
+    end
+    # ------------------------------------ The Loop
+    def do_the_loop
+      files.each do |file|
+        # get the path of the file relative to the html
+        # directory (scraped dir)
+        #
+        file_path = file.gsub(/#{html_dir}\//, '')
+        # clean the contents of the html file so we can work
+        # with it
+        #
+        contents = clean_html(file)
+        # set the references to where the new files will
+        # live
+        #
+        tmp_file_path       = "#{tmp_dir}/#{file_path}"
+        markdown_file_path  = "#{md_dir}/#{file_path}.md"
+        text_file_path      = "#{txt_dir}/#{file_path}.txt"
+        # find or create directories that will contain the
+        # file
+        #
+        verify_file_directories([
+          tmp_file_path,
+          markdown_file_path,
+          text_file_path
+        ])
+        # write a temporary html file with the cleaned-up
+        # contents
+        #
+        write_file(tmp_file_path, contents)
+        # now we hone in on the html contents and strip the
+        # stuff we don't need
+        #
+        adj_contents = filter_html(tmp_file_path)
+        # convert the adjusted html to markdown and write it
+        # to file
+        #
+        write_file(markdown_file_path, html2markdown(adj_contents))
+        # last, we remove all the tags and write the plain
+        # text file
+        #
+        write_file(text_file_path, strip_tags(adj_contents))
+      end
+    end
+    # ------------------------------------ Parsing Actions
+    def clean_html(file)
+      File.read(file)
+        .encode('UTF-8', :invalid => :replace, :undef => :replace)
+        .split(' ')
+        .to_s
+        .gsub(/\\u0000/, '')
+        .split('", "')
+        .join(' ')
+        .gsub(/\\/, '')
+        .gsub(/\"\]/, '')
+        .gsub(/\[\"/, '')
+        .gsub(/[”“]/, '"')
+        .gsub(/[’]/, "'")
+        .gsub(/[é]/, 'e')
+        .gsub(/[–]/, '-')
+    end
+    def filter_html(file_path)
+      contents = File.read(file_path, :encoding => 'ASCII')
+      page = Nokogiri::HTML(contents)
+      content = page.css(selector).to_s
+      # content = page.css('body').to_s if content == ''
+    end
+    def strip_tags(html)
+      html = html.gsub(/(<[^>]*>)|\n|\t/s, ' ')
+      html.gsub(/(\ \ )+/, "\n\n")
+    end
+    def html2markdown(html)
+      ReverseMarkdown.convert(
+        html,
+        :unknown_tags => :bypass,
+        :github_flavored => true
+      )
+    end
+    # ------------------------------------ Writing Files
+    def write_file(file_path, content)
+      File.open(file_path, 'w') { |file| file.write(content) }
+    end
+  end
+end

data/lib/sitetap/scraper.rb ADDED Viewed

@@ -0,0 +1,63 @@
+require 'fileutils'
+module Sitetap
+  class Scraper
+    def initialize(url)
+      @url = url.strip.gsub(/\/$/, '')
+    end
+    def self.scrape!(url)
+      scraper = Sitetap::Scraper.new(url)
+      scraper.scrape!
+      scraper
+    end
+    def scrape!
+      verify_dir
+      wget
+      self
+    end
+    def dir
+      root
+    end
+    private
+      def domain
+        @domain ||= @url.gsub(/http(s)?\:\/\//, '')
+      end
+      def root
+        @root ||= "#{Dir.pwd}/#{domain}"
+      end
+      def html_dir
+        "#{root}/html"
+      end
+      def verify_dir
+        unless Dir.exists?(html_dir)
+          FileUtils.mkdir_p(html_dir)
+        end
+      end
+      def wget_options
+        [
+          '--recursive',
+          '--page-requisites',
+          '--html-extension',
+          '--convert-links',
+          '--restrict-file-names=windows',
+          '--span-hosts'
+        ]
+      end
+      def wget
+        system("cd #{html_dir}; wget #{wget_options.join(' ')} --domains #{domain} #{@url}; cd ../../")
+        # add `-o #{log_dir}/scrape.log` to store logfile
+      end
+  end
+end

data/lib/sitetap/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Sitetap
-  VERSION = "0.0.0"
+  VERSION = "0.1.0"
 end

data/sitetap.gemspec CHANGED Viewed

@@ -20,4 +20,6 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "bundler", "~> 1.7"
   spec.add_development_dependency "rake", "~> 10.0"
+  spec.add_runtime_dependency "nokogiri"
+  spec.add_runtime_dependency "reverse_markdown"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: sitetap
 version: !ruby/object:Gem::Version
-  version: 0.0.0
+  version: 0.1.0
 platform: ruby
 authors:
 - Sean C Davis
@@ -38,10 +38,39 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '10.0'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: reverse_markdown
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: ''
 email:
 - scdavis41@gmail.com
-executables: []
+executables:
+- sitetap
 extensions: []
 extra_rdoc_files: []
 files:
@@ -50,7 +79,10 @@ files:
 - LICENSE.txt
 - README.md
 - Rakefile
+- bin/sitetap
 - lib/sitetap.rb
+- lib/sitetap/parser.rb
+- lib/sitetap/scraper.rb
 - lib/sitetap/version.rb
 - sitetap.gemspec
 homepage: ''