urlarchiver 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/archiver.rb +74 -0
  3. metadata +45 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a1ab52e74bee05d257e62ee0e35a5d0cd92099c4
4
+ data.tar.gz: ced5a1dd308846cadb08973673addeaddb11c47f
5
+ SHA512:
6
+ metadata.gz: ecf480684ce7b304b8d1920b51e3878468f368273ab2150eb9d974e49d6f1697d2d987306d74ff2d8430742bbb0142a4bb717934f02958852184ed292152b447
7
+ data.tar.gz: 60c648e12b67f1fb9944c5918ec238481de839006187b40ebd0e246706d86846d718ef670931655233c450c6696e393ae0a18dcafe2e79ddd96bef877af2dc47
data/lib/archiver.rb ADDED
@@ -0,0 +1,74 @@
1
+ require 'nokogiri'
2
+ require 'pdfkit'
3
+ require 'open-uri'
4
+ require 'fileutils'
5
+ require 'json'
6
+
7
# Archives web pages by saving both the raw HTML and a rendered PDF under
# public/uploads/archive/.
#
# type: "single" additionally stores the archived page's paths and text in
# @output (retrievable via #genJSON); any other value leaves @output to be
# populated by #multiarchive.
class Archiver
  ARCHIVE_DIR = 'public/uploads/archive'.freeze

  def initialize(type)
    @type = type
  end

  # Archive a single url.
  #
  # Fetches the page, writes <name>.html and <name>.pdf into ARCHIVE_DIR
  # (the file name is the URL with its scheme forced to https and all
  # '/', '.', ':' characters stripped — kept for backward compatibility).
  #
  # Returns a Hash with :pdf_path and :html_path on success, or nil on any
  # failure (bad URL, network error, render error).
  def archiveone(url)
    html = Nokogiri::HTML(URI.open(url))
    # Render the PDF from the http:// form of the URL; do NOT mutate the
    # caller's string (the original used a destructive gsub! here).
    fetch_url = url.gsub("https", "http")
    kit = PDFKit.new(fetch_url, page_size: "Letter")

    # Save files
    FileUtils.mkdir_p ARCHIVE_DIR
    base = fetch_url.gsub("http", "https").delete("/.:")
    pdf_path = "#{ARCHIVE_DIR}/#{base}.pdf"
    html_path = pdf_path.sub(/\.pdf\z/, ".html")

    File.open(html_path, 'w') { |file| file.write(html) }
    kit.to_file(pdf_path)

    # Save output if single url
    if @type == "single"
      @output = {
        pdf_path: pdf_path,
        html_path: html_path,
        text: html.text
      }
    end

    # Return file paths
    { pdf_path: pdf_path, html_path: html_path }
  rescue StandardError
    # Best-effort contract: any failure yields nil so callers such as
    # #multiarchive can skip the record instead of aborting the batch.
    nil
  end

  # Archive the URL stored under +field+ for every record in a JSON array,
  # accumulating enriched records into @output (read back via #genJSON).
  #
  # json  - a JSON string encoding an array of objects
  # field - the key whose value is the URL to archive; the resulting paths
  #         are stored under "<field>_htmlpath" and "<field>_pdfpath"
  def multiarchive(json, field)
    records = JSON.parse(json)
    html_key = "#{field}_htmlpath"
    pdf_key = "#{field}_pdfpath"
    @output = []

    records.each do |record|
      item = {}
      paths = archiveone(record[field])

      # Save paths. BUG FIX: the original assigned the pdf path to the html
      # field and the html path to the pdf field — they were swapped.
      if paths
        item[html_key] = paths[:html_path]
        item[pdf_key] = paths[:pdf_path]
      end

      # Carry every original field through unchanged.
      record.each do |key, value|
        item[key] = value
      end

      @output.push(item)
    end
  end

  # Generate pretty-printed JSON from the accumulated @output.
  # Returns "null" if nothing has been archived yet.
  def genJSON
    JSON.pretty_generate(@output)
  end
end
74
+
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: urlarchiver
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - M. C. McGrath
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-05-26 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Saves html and pdfs of websites.
14
+ email: shidash@shidash.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/archiver.rb
20
+ homepage: https://github.com/TransparencyToolkit/Archiver
21
+ licenses:
22
+ - GPL
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.0.14
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Archives websites.
44
+ test_files: []
45
+ has_rdoc: