urlarchiver 0.0.1
- checksums.yaml +7 -0
- data/lib/archiver.rb +74 -0
- metadata +45 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: a1ab52e74bee05d257e62ee0e35a5d0cd92099c4
+  data.tar.gz: ced5a1dd308846cadb08973673addeaddb11c47f
+SHA512:
+  metadata.gz: ecf480684ce7b304b8d1920b51e3878468f368273ab2150eb9d974e49d6f1697d2d987306d74ff2d8430742bbb0142a4bb717934f02958852184ed292152b447
+  data.tar.gz: 60c648e12b67f1fb9944c5918ec238481de839006187b40ebd0e246706d86846d718ef670931655233c450c6696e393ae0a18dcafe2e79ddd96bef877af2dc47
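
These digests cover the two payload files inside the .gem archive. A minimal verification sketch in Ruby, assuming the gem has been unpacked first (e.g. with tar -xf urlarchiver-0.0.1.gem, which yields metadata.gz, data.tar.gz, and checksums.yaml.gz; gunzip the last to get checksums.yaml):

  require 'digest'
  require 'yaml'

  # Compare each payload's SHA1/SHA512 against the values in checksums.yaml.
  sums = YAML.load_file('checksums.yaml')
  %w[metadata.gz data.tar.gz].each do |name|
    data = File.binread(name)
    ok = Digest::SHA1.hexdigest(data) == sums['SHA1'][name] &&
         Digest::SHA512.hexdigest(data) == sums['SHA512'][name]
    puts "#{name}: #{ok ? 'OK' : 'MISMATCH'}"
  end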
data/lib/archiver.rb
ADDED
@@ -0,0 +1,74 @@
+require 'nokogiri'
+require 'pdfkit'
+require 'open-uri'
+require 'fileutils'
+require 'json'
+
+class Archiver
+  def initialize(type)
+    @type = type
+  end
+
+  # Archive a single url: save an HTML snapshot and a PDF render.
+  # Returns a hash of file paths, or nil if the url could not be fetched.
+  def archiveone(url)
+    begin
+      html = Nokogiri::HTML(open(url))
+      # wkhtmltopdf (used by PDFKit) is fed the http form of the url
+      url.gsub!("https", "http")
+      k = PDFKit.new(url, :page_size => "Letter")
+
+      # Save files, deriving a flat filename from the original url
+      FileUtils::mkdir_p 'public/uploads/archive'
+      filepath = "public/uploads/archive/" + url.gsub("http", "https").gsub("/", "").gsub(".", "").gsub(":", "") + ".pdf"
+      File.open(filepath.gsub(".pdf", ".html"), 'w') { |file| file.write(html) }
+      k.to_file(filepath)
+
+      # Save output if single url
+      if @type == "single"
+        @output = Hash.new
+        @output[:pdf_path] = filepath
+        @output[:html_path] = filepath.gsub(".pdf", ".html")
+        @output[:text] = html.text
+      end
+
+      # Return file paths
+      out = Hash.new
+      out[:pdf_path] = filepath
+      out[:html_path] = filepath.gsub(".pdf", ".html")
+      return out
+    rescue
+      # Swallow fetch/render errors; callers treat nil as a failed archive
+      nil
+    end
+  end
+
+  # Archive the url in the given field of each record in a JSON array
+  def multiarchive(json, field)
+    pjson = JSON.parse(json)
+    htmlfield = field + "_htmlpath"
+    pdffield = field + "_pdfpath"
+    @output = Array.new
+
+    pjson.each do |p|
+      pitem = Hash.new
+      paths = archiveone(p[field])
+
+      # Save paths (skipped when archiveone failed and returned nil)
+      if !(paths == nil)
+        pitem[htmlfield] = paths[:html_path]
+        pitem[pdffield] = paths[:pdf_path]
+      end
+
+      # Save other fields from the original record
+      p.each do |key, value|
+        pitem[key] = value
+      end
+
+      @output.push(pitem)
+    end
+  end
+
+  # Generate JSON from output
+  def genJSON
+    JSON.pretty_generate(@output)
+  end
+end
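
A usage sketch for the class above (not part of the gem; the URL, the "url" field name, and the "multi" type string are illustrative — the code only special-cases "single"). PDFKit needs the wkhtmltopdf binary installed, and output lands under public/uploads/archive relative to the working directory:

  require 'archiver'

  # Single page: returns { :pdf_path => ..., :html_path => ... }, or nil on failure.
  paths = Archiver.new("single").archiveone("https://example.com")
  puts paths[:pdf_path] if paths

  # Batch: archive the "url" field of each record, then emit enriched JSON.
  a = Archiver.new("multi")
  a.multiarchive('[{"url": "https://example.com", "title": "Example"}]', "url")
  puts a.genJSON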
metadata
ADDED
@@ -0,0 +1,45 @@
+--- !ruby/object:Gem::Specification
+name: urlarchiver
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- M. C. McGrath
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-05-26 00:00:00.000000000 Z
+dependencies: []
+description: Saves html and pdfs of websites.
+email: shidash@shidash.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/archiver.rb
+homepage: https://github.com/TransparencyToolkit/Archiver
+licenses:
+- GPL
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.0.14
+signing_key:
+specification_version: 4
+summary: Archives websites.
+test_files: []
+has_rdoc:
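
One caveat worth noting: the gemspec declares dependencies: [] even though lib/archiver.rb requires nokogiri and pdfkit, so gem install urlarchiver will not pull them in. A Gemfile sketch covering the runtime requirements (the unconstrained versions are an assumption):

  source 'https://rubygems.org'

  gem 'urlarchiver'
  gem 'nokogiri'  # HTML parsing
  gem 'pdfkit'    # PDF rendering; also needs the wkhtmltopdf binary on PATH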