RubyGems - sitetap - Versions diffs - 0.0.0 → 0.1.0 - Mend

sitetap 0.0.0 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 54aa226490e4ea05843234b47534780ce0514413
-  data.tar.gz: 6a2354d47a8fa7ebc1a4838e8ac3b394f6556abd
+  metadata.gz: 910630c447b621e63f65047fb027b385913d0a21
+  data.tar.gz: 6f2689ccdbfd4897d8e9ac8adce44452be392423
 SHA512:
-  metadata.gz: 01ce3d3194cf12fb9c66d30e3c1a3010bf80e41bd6f68725e6d98cce87bef32277d15a3b2e43c53a0f69c9b56393a4aab85ff9d74a28a6b3e25da89ec693e939
-  data.tar.gz: 07dd9f5e9e0653f06465bedbccf80242d442aac615c47746fd11103902ba0f206daf38f14257d1f8d5882b92deaab8d3e70b7e6e82c2cc42bdaef1f66cd8f017
+  metadata.gz: 5201c5d44db48541d76ca615a309968b0bd3956e9a50fb1d9eea2ea1f0e8674fbda53bb95f2dd8251a327451348930bdb86592e398d2ea032509a8caef96f267
+  data.tar.gz: 06a8d55f597e7c8289af49469d01b45f279cf6d271c3c29eba4cf38a72df0b791a8208e441ae26832ae6bc689608e2cb786d7f76442733bbedf09c8f0186d484

data/README.md CHANGED Viewed

@@ -1,10 +1,16 @@
-# Sitetap
+SiteTap
+==========
-TODO: Write a gem description
+SiteTap takes a home page URL and turns it into a packaged directory of:
-## Installation
+* html
+* plain text
+* markdown
-Add this line to your application's Gemfile:
+Installation
+----------
+To install this to a ruby project, add the following to your `Gemfile`:
 ```ruby
 gem 'sitetap'
@@ -12,17 +18,56 @@ gem 'sitetap'
 And then execute:
-    $ bundle
+```text
+$ bundle install
+```
+Or install it so you can run it globally:
+```text
+$ gem install sitetap
+```
+Usage
+----------
+Using SiteTap is quite simple. You just run the executable and give it a URL.
+```text
+$ sitetap [URL]
+```
+So, if I wanted to scrape Sapwood's website, I could do this:
+```text
+$ sitetap "http://sapwood.org/"
+```
+Within your current directory, this will create the following directory
+structure:
+```text
+- sapwood.org
+    - html
+    - markdown
+    - text
+    - tmp
+```
+Within each are the converted files from the website.
-Or install it yourself as:
+Bugs
+----------
-    $ gem install sitetap
+Please [create an issue](https://github.com/seancdavis/sitetap/issues/new) if
+you encounter a bug.
-## Usage
+Contributing
+----------
-TODO: Write usage instructions here
+Missing a feature? Add it!
-## Contributing
+Found a bug? Fix it!
 1. Fork it ( https://github.com/[my-github-username]/sitetap/fork )
 2. Create your feature branch (`git checkout -b my-new-feature`)

data/bin/sitetap ADDED Viewed

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+require 'sitetap/scraper'
+require 'sitetap/parser'
+url = ARGV[0]
+if url.nil? || url == ''
+  puts "Usage: sitetap [URL]"
+  exit
+else
+  scraper = Sitetap::Scraper.scrape!(url)
+  parser = Sitetap::Parser.parse!(scraper.dir)
+end

data/lib/sitetap/parser.rb ADDED Viewed

@@ -0,0 +1,174 @@
+require 'nokogiri'
+require 'reverse_markdown'
+require 'fileutils'
+module Sitetap
+  class Parser
+    def initialize(root_dir)
+      @root = root_dir
+    end
+    def self.parse!(root_dir)
+      parser = Sitetap::Parser.new(root_dir).parse!
+      parser
+    end
+    def parse!
+      verify_directories
+      do_the_loop
+      self
+    end
+    private
+    # ------------------------------------ References
+    def root
+      @root
+    end
+    def html_dir
+      @html_dir ||= "#{root}/html"
+    end
+    def tmp_dir
+      @tmp_dir ||= "#{root}/tmp"
+    end
+    def md_dir
+      @md_dir ||= "#{root}/markdown"
+    end
+    def txt_dir
+      @txt_dir ||= "#{root}/text"
+    end
+    def selector
+      @selector ||= "body"
+    end
+    def files
+      @files ||= Dir.glob("#{html_dir}/**/*.html")
+    end
+    # ------------------------------------ Directories
+    def mkdir_p(dir)
+      unless Dir.exists?(dir)
+        FileUtils.mkdir_p(dir)
+      end
+    end
+    def verify_directories
+      [tmp_dir, md_dir, txt_dir].each { |dir| mkdir_p(dir) }
+    end
+    def verify_file_directories(files)
+      files.each do |file|
+        dir = file.split('/')[0..-2].join('/')
+        mkdir_p(dir)
+      end
+    end
+    # ------------------------------------ The Loop
+    def do_the_loop
+      files.each do |file|
+        # get the path of the file relative to the html
+        # directory (scraped dir)
+        #
+        file_path = file.gsub(/#{html_dir}\//, '')
+        # clean the contents of the html file so we can work
+        # with it
+        #
+        contents = clean_html(file)
+        # set the references to where the new files will
+        # live
+        #
+        tmp_file_path       = "#{tmp_dir}/#{file_path}"
+        markdown_file_path  = "#{md_dir}/#{file_path}.md"
+        text_file_path      = "#{txt_dir}/#{file_path}.txt"
+        # find or create directories that will contain the
+        # file
+        #
+        verify_file_directories([
+          tmp_file_path,
+          markdown_file_path,
+          text_file_path
+        ])
+        # write a temporary html file with the cleaned-up
+        # contents
+        #
+        write_file(tmp_file_path, contents)
+        # now we hone in on the html contents and strip the
+        # stuff we don't need
+        #
+        adj_contents = filter_html(tmp_file_path)
+        # convert the adjusted html to markdown and write it
+        # to file
+        #
+        write_file(markdown_file_path, html2markdown(adj_contents))
+        # last, we remove all the tags and write the plain
+        # text file
+        #
+        write_file(text_file_path, strip_tags(adj_contents))
+      end
+    end
+    # ------------------------------------ Parsing Actions
+    def clean_html(file)
+      File.read(file)
+        .encode('UTF-8', :invalid => :replace, :undef => :replace)
+        .split(' ')
+        .to_s
+        .gsub(/\\u0000/, '')
+        .split('", "')
+        .join(' ')
+        .gsub(/\\/, '')
+        .gsub(/\"\]/, '')
+        .gsub(/\[\"/, '')
+        .gsub(/[”“]/, '"')
+        .gsub(/[’]/, "'")
+        .gsub(/[é]/, 'e')
+        .gsub(/[–]/, '-')
+    end
+    def filter_html(file_path)
+      contents = File.read(file_path, :encoding => 'ASCII')
+      page = Nokogiri::HTML(contents)
+      content = page.css(selector).to_s
+      # content = page.css('body').to_s if content == ''
+    end
+    def strip_tags(html)
+      html = html.gsub(/(<[^>]*>)|\n|\t/s, ' ')
+      html.gsub(/(\ \ )+/, "\n\n")
+    end
+    def html2markdown(html)
+      ReverseMarkdown.convert(
+        html,
+        :unknown_tags => :bypass,
+        :github_flavored => true
+      )
+    end
+    # ------------------------------------ Writing Files
+    def write_file(file_path, content)
+      File.open(file_path, 'w') { |file| file.write(content) }
+    end
+  end
+end

data/lib/sitetap/scraper.rb ADDED Viewed

@@ -0,0 +1,63 @@
+require 'fileutils'
+module Sitetap
+  class Scraper
+    def initialize(url)
+      @url = url.strip.gsub(/\/$/, '')
+    end
+    def self.scrape!(url)
+      scraper = Sitetap::Scraper.new(url)
+      scraper.scrape!
+      scraper
+    end
+    def scrape!
+      verify_dir
+      wget
+      self
+    end
+    def dir
+      root
+    end
+    private
+      def domain
+        @domain ||= @url.gsub(/http(s)?\:\/\//, '')
+      end
+      def root
+        @root ||= "#{Dir.pwd}/#{domain}"
+      end
+      def html_dir
+        "#{root}/html"
+      end
+      def verify_dir
+        unless Dir.exists?(html_dir)
+          FileUtils.mkdir_p(html_dir)
+        end
+      end
+      def wget_options
+        [
+          '--recursive',
+          '--page-requisites',
+          '--html-extension',
+          '--convert-links',
+          '--restrict-file-names=windows',
+          '--span-hosts'
+        ]
+      end
+      def wget
+        system("cd #{html_dir}; wget #{wget_options.join(' ')} --domains #{domain} #{@url}; cd ../../")
+        # add `-o #{log_dir}/scrape.log` to store logfile
+      end
+  end
+end

data/lib/sitetap/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Sitetap
-  VERSION = "0.0.0"
+  VERSION = "0.1.0"
 end

data/sitetap.gemspec CHANGED Viewed

@@ -20,4 +20,6 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "bundler", "~> 1.7"
   spec.add_development_dependency "rake", "~> 10.0"
+  spec.add_runtime_dependency "nokogiri"
+  spec.add_runtime_dependency "reverse_markdown"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: sitetap
 version: !ruby/object:Gem::Version
-  version: 0.0.0
+  version: 0.1.0
 platform: ruby
 authors:
 - Sean C Davis
@@ -38,10 +38,39 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '10.0'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: reverse_markdown
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: ''
 email:
 - scdavis41@gmail.com
-executables: []
+executables:
+- sitetap
 extensions: []
 extra_rdoc_files: []
 files:
@@ -50,7 +79,10 @@ files:
 - LICENSE.txt
 - README.md
 - Rakefile
+- bin/sitetap
 - lib/sitetap.rb
+- lib/sitetap/parser.rb
+- lib/sitetap/scraper.rb
 - lib/sitetap/version.rb
 - sitetap.gemspec
 homepage: ''