urlarchiver 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +7 -0
  2. data/lib/archiver.rb +74 -0
  3. metadata +45 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a1ab52e74bee05d257e62ee0e35a5d0cd92099c4
4
+ data.tar.gz: ced5a1dd308846cadb08973673addeaddb11c47f
5
+ SHA512:
6
+ metadata.gz: ecf480684ce7b304b8d1920b51e3878468f368273ab2150eb9d974e49d6f1697d2d987306d74ff2d8430742bbb0142a4bb717934f02958852184ed292152b447
7
+ data.tar.gz: 60c648e12b67f1fb9944c5918ec238481de839006187b40ebd0e246706d86846d718ef670931655233c450c6696e393ae0a18dcafe2e79ddd96bef877af2dc47
data/lib/archiver.rb ADDED
@@ -0,0 +1,74 @@
1
+ require 'nokogiri'
2
+ require 'pdfkit'
3
+ require 'open-uri'
4
+ require 'fileutils'
5
+ require 'json'
6
+
7
# Archives web pages by saving an HTML snapshot and a rendered PDF copy.
#
# type - "single" records full output (paths + extracted text) in @output
#        when archiving one url; any other value is treated as batch mode,
#        where @output is populated by #multiarchive instead.
class Archiver
  def initialize(type)
    @type = type
  end

  # Archive a single url.
  #
  # Saves the raw HTML and a PDF rendering under public/uploads/archive/.
  # Returns a Hash with :pdf_path and :html_path on success, or nil when
  # fetching or rendering fails (best-effort: one bad url must not abort
  # a batch run).
  def archiveone(url)
    # Kernel#open on URLs was removed in Ruby 3; URI.open is the
    # supported open-uri entry point.
    html = Nokogiri::HTML(URI.open(url))

    # PDFKit is fed the plain-http form of the url. Use a non-mutating
    # gsub — the original gsub! clobbered the caller's string in place.
    http_url = url.gsub("https", "http")
    kit = PDFKit.new(http_url, page_size: "Letter")

    FileUtils.mkdir_p("public/uploads/archive")

    # Filesystem-safe name built from the canonical (https) form of the
    # url with separators, dots and colons stripped — same characters the
    # original removed with three chained gsubs.
    slug = http_url.gsub("http", "https").delete("/.:")
    pdf_path = "public/uploads/archive/#{slug}.pdf"
    html_path = pdf_path.sub(/\.pdf\z/, ".html")

    File.open(html_path, "w") { |file| file.write(html) }
    kit.to_file(pdf_path)

    # In single mode also keep the extracted text for #genJSON.
    if @type == "single"
      @output = { pdf_path: pdf_path, html_path: html_path, text: html.text }
    end

    { pdf_path: pdf_path, html_path: html_path }
  rescue StandardError => e
    # Best-effort: report the failure instead of swallowing it silently
    # (the original bare rescue hid the reason), then signal with nil.
    warn "Archiver: failed to archive #{url}: #{e.message}"
    nil
  end

  # Archive the url stored in +field+ of every record in +json+ (a JSON
  # array of objects). Accumulates the results in @output for #genJSON.
  #
  # Each result record carries the original fields plus
  # "<field>_htmlpath" / "<field>_pdfpath" when archiving succeeded.
  # Returns the array of result records.
  def multiarchive(json, field)
    htmlfield = "#{field}_htmlpath"
    pdffield = "#{field}_pdfpath"
    @output = []

    JSON.parse(json).each do |record|
      item = {}
      paths = archiveone(record[field])

      # Bug fix: the original assigned the pdf path to the html field and
      # the html path to the pdf field; store each under its matching key.
      if paths
        item[htmlfield] = paths[:html_path]
        item[pdffield] = paths[:pdf_path]
      end

      # Carry the record's original fields through unchanged.
      record.each do |key, value|
        item[key] = value
      end

      @output.push(item)
    end

    @output
  end

  # Serialize the accumulated output (@output) as pretty-printed JSON.
  def genJSON
    JSON.pretty_generate(@output)
  end
end
74
+
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: urlarchiver
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - M. C. McGrath
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-05-26 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Saves html and pdfs of websites.
14
+ email: shidash@shidash.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/archiver.rb
20
+ homepage: https://github.com/TransparencyToolkit/Archiver
21
+ licenses:
22
+ - GPL
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.0.14
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Archives websites.
44
+ test_files: []
45
+ has_rdoc: