urlarchiver 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/archiver.rb +74 -0
- metadata +45 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: a1ab52e74bee05d257e62ee0e35a5d0cd92099c4
|
4
|
+
data.tar.gz: ced5a1dd308846cadb08973673addeaddb11c47f
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ecf480684ce7b304b8d1920b51e3878468f368273ab2150eb9d974e49d6f1697d2d987306d74ff2d8430742bbb0142a4bb717934f02958852184ed292152b447
|
7
|
+
data.tar.gz: 60c648e12b67f1fb9944c5918ec238481de839006187b40ebd0e246706d86846d718ef670931655233c450c6696e393ae0a18dcafe2e79ddd96bef877af2dc47
|
data/lib/archiver.rb
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'pdfkit'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'json'
|
6
|
+
|
7
|
+
# Saves web pages to disk as an HTML snapshot plus a PDF rendering.
#
# Results are kept in @output and can be serialised with #genJSON:
# * after #archiveone on an instance created with type "single", @output is
#   a Hash with :pdf_path, :html_path and :text;
# * after #multiarchive, @output is an Array with one Hash per input record.
class Archiver
  # @param type [String] archiving mode; only the value "single" is inspected
  #   by this class (it makes #archiveone record its result in @output).
  def initialize(type)
    @type = type
  end

  # Archive a single url.
  #
  # Fetches the page, renders it to PDF with PDFKit and writes both the HTML
  # and the PDF under public/uploads/archive (created if missing). The file
  # name is the url with "/", "." and ":" stripped.
  #
  # This is best-effort: any StandardError raised while fetching, rendering
  # or writing is reported on stderr and turned into a nil return value, which
  # #multiarchive relies on to skip records that could not be archived.
  #
  # @param url [String] address of the page to archive; it is not modified.
  # @return [Hash, nil] {:pdf_path => String, :html_path => String}, or nil
  #   when archiving failed.
  def archiveone(url)
    # Go through URI#open (provided by open-uri) instead of Kernel#open:
    # Kernel#open would run "|command" input as a subprocess and no longer
    # fetches URLs on Ruby 3.0+. The block form closes the downloaded IO.
    html = URI.parse(url).open { |io| Nokogiri::HTML(io) }

    # PDFKit is handed the http form of the address (presumably to sidestep
    # TLS problems in wkhtmltopdf - TODO confirm). Work on a copy so the
    # caller's string is left untouched and frozen strings are accepted.
    pdf_url = url.gsub("https", "http")
    # Render once and keep the bytes; the render happens before anything is
    # written so a failed render leaves no half-finished archive behind.
    pdf = PDFKit.new(pdf_url, :page_size => "Letter").to_pdf

    # Save files
    FileUtils.mkdir_p 'public/uploads/archive'
    filepath = "public/uploads/archive/" + pdf_url.gsub("http", "https").delete("/.:") + ".pdf"
    html_path = filepath.gsub(".pdf", ".html")
    File.open(html_path, 'w') { |file| file.write(html) }
    File.binwrite(filepath, pdf)

    # Save output if single url
    if @type == "single"
      @output = {
        :pdf_path => filepath,
        :html_path => html_path,
        :text => html.text
      }
    end

    # Return file paths
    { :pdf_path => filepath, :html_path => html_path }
  rescue StandardError => e
    warn "Archiver: could not archive #{url.inspect}: #{e.class}: #{e.message}"
    nil
  end

  # Archive the url found in one field of every record of a JSON array.
  #
  # Each output record holds "<field>_htmlpath" and "<field>_pdfpath" (only
  # when archiving that record succeeded) followed by all of the record's
  # original fields. The records are stored in @output.
  #
  # @param json [String] JSON text containing an array of objects.
  # @param field [String] name of the key whose value is the url to archive.
  # @return [Array] the parsed input records.
  def multiarchive(json, field)
    pjson = JSON.parse(json)
    htmlfield = field + "_htmlpath"
    pdffield = field + "_pdfpath"
    @output = []

    pjson.each do |record|
      pitem = {}
      paths = archiveone(record[field])

      # Save paths (skipped when the url could not be archived)
      if paths
        pitem[htmlfield] = paths[:html_path]
        pitem[pdffield] = paths[:pdf_path]
      end

      # Save other fields
      record.each do |key, value|
        pitem[key] = value
      end

      @output.push(pitem)
    end
  end

  # Generate JSON from output
  #
  # @return [String] pretty-printed JSON of whatever the last archiving call
  #   stored in @output.
  def genJSON
    JSON.pretty_generate(@output)
  end
end
|
74
|
+
|
metadata
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: urlarchiver
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- M. C. McGrath
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-05-26 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Saves html and pdfs of websites.
|
14
|
+
email: shidash@shidash.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/archiver.rb
|
20
|
+
homepage: https://github.com/TransparencyToolkit/Archiver
|
21
|
+
licenses:
|
22
|
+
- GPL
|
23
|
+
metadata: {}
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
34
|
+
requirements:
|
35
|
+
- - '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
requirements: []
|
39
|
+
rubyforge_project:
|
40
|
+
rubygems_version: 2.0.14
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: Archives websites.
|
44
|
+
test_files: []
|
45
|
+
has_rdoc:
|