urlarchiver 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +7 -0
  2. data/lib/archiver.rb +74 -0
  3. metadata +45 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a1ab52e74bee05d257e62ee0e35a5d0cd92099c4
4
+ data.tar.gz: ced5a1dd308846cadb08973673addeaddb11c47f
5
+ SHA512:
6
+ metadata.gz: ecf480684ce7b304b8d1920b51e3878468f368273ab2150eb9d974e49d6f1697d2d987306d74ff2d8430742bbb0142a4bb717934f02958852184ed292152b447
7
+ data.tar.gz: 60c648e12b67f1fb9944c5918ec238481de839006187b40ebd0e246706d86846d718ef670931655233c450c6696e393ae0a18dcafe2e79ddd96bef877af2dc47
data/lib/archiver.rb ADDED
@@ -0,0 +1,74 @@
1
+ require 'nokogiri'
2
+ require 'pdfkit'
3
+ require 'open-uri'
4
+ require 'fileutils'
5
+ require 'json'
6
+
7
# Archives web pages by saving an HTML snapshot and a rendered PDF copy.
#
# type - "single" records full output (paths + extracted text) in @output
#        when archiving one url; any other value is treated as batch mode,
#        where @output is populated by #multiarchive instead.
class Archiver
  def initialize(type)
    @type = type
  end

  # Archive a single url.
  #
  # Saves the raw HTML and a PDF rendering under public/uploads/archive/.
  # Returns a Hash with :pdf_path and :html_path on success, or nil when
  # fetching or rendering fails (best-effort: one bad url must not abort
  # a batch run).
  def archiveone(url)
    # Kernel#open on URLs was removed in Ruby 3; URI.open is the
    # supported open-uri entry point.
    html = Nokogiri::HTML(URI.open(url))

    # PDFKit is fed the plain-http form of the url. Use a non-mutating
    # gsub — the original gsub! clobbered the caller's string in place.
    http_url = url.gsub("https", "http")
    kit = PDFKit.new(http_url, page_size: "Letter")

    FileUtils.mkdir_p("public/uploads/archive")

    # Filesystem-safe name built from the canonical (https) form of the
    # url with separators, dots and colons stripped — same characters the
    # original removed with three chained gsubs.
    slug = http_url.gsub("http", "https").delete("/.:")
    pdf_path = "public/uploads/archive/#{slug}.pdf"
    html_path = pdf_path.sub(/\.pdf\z/, ".html")

    File.open(html_path, "w") { |file| file.write(html) }
    kit.to_file(pdf_path)

    # In single mode also keep the extracted text for #genJSON.
    if @type == "single"
      @output = { pdf_path: pdf_path, html_path: html_path, text: html.text }
    end

    { pdf_path: pdf_path, html_path: html_path }
  rescue StandardError => e
    # Best-effort: report the failure instead of swallowing it silently
    # (the original bare rescue hid the reason), then signal with nil.
    warn "Archiver: failed to archive #{url}: #{e.message}"
    nil
  end

  # Archive the url stored in +field+ of every record in +json+ (a JSON
  # array of objects). Accumulates the results in @output for #genJSON.
  #
  # Each result record carries the original fields plus
  # "<field>_htmlpath" / "<field>_pdfpath" when archiving succeeded.
  # Returns the array of result records.
  def multiarchive(json, field)
    htmlfield = "#{field}_htmlpath"
    pdffield = "#{field}_pdfpath"
    @output = []

    JSON.parse(json).each do |record|
      item = {}
      paths = archiveone(record[field])

      # Bug fix: the original assigned the pdf path to the html field and
      # the html path to the pdf field; store each under its matching key.
      if paths
        item[htmlfield] = paths[:html_path]
        item[pdffield] = paths[:pdf_path]
      end

      # Carry the record's original fields through unchanged.
      record.each do |key, value|
        item[key] = value
      end

      @output.push(item)
    end

    @output
  end

  # Serialize the accumulated output (@output) as pretty-printed JSON.
  def genJSON
    JSON.pretty_generate(@output)
  end
end
74
+
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: urlarchiver
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - M. C. McGrath
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-05-26 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Saves html and pdfs of websites.
14
+ email: shidash@shidash.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/archiver.rb
20
+ homepage: https://github.com/TransparencyToolkit/Archiver
21
+ licenses:
22
+ - GPL
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.0.14
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Archives websites.
44
+ test_files: []
45
+ has_rdoc: