webpage-archivist 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,93 @@
1
+ require 'time'
2
+ require 'sequel'
3
+
4
module WebpageArchivist

  # Shared Sequel connection: DATABASE_URL when it is set, otherwise a SQLite
  # file named webpage-archivist.sqlite3 in the current working directory.
  DATABASE = ::Sequel.connect(ENV['DATABASE_URL'] || "sqlite://#{Dir.pwd}/webpage-archivist.sqlite3", :encoding => 'utf-8')

  # Declares the Sequel migrations and applies the pending ones at startup,
  # code adapted from the sinatra-sequel gem
  class Migrations

    # Registered migrations in declaration order, each one a hash
    # {:name => ..., :block => ...}
    @@migrations = []

    # Register a migration
    # name:: the name recorded in the migrations table once it has run
    # block:: the schema changes to perform
    def self.migration(name, &block)
      @@migrations << {:name => name, :block => block}
    end

    # Run, in registration order, each migration whose name is not yet in the
    # migrations table. The migration body and its bookkeeping row share one
    # transaction.
    def run
      create_migrations_table
      @@migrations.each do |migration|
        name = migration[:name]
        next unless DATABASE[:migrations].filter(:name => name).count == 0

        p "Running migration: #{name}"
        DATABASE.transaction do
          migration[:block].call
          DATABASE[:migrations].insert(:name => name, :ran_at => Time.now)
        end
      end
    end

    # Create the bookkeeping table unless it already exists
    def create_migrations_table
      DATABASE.create_table? :migrations do
        primary_key :id
        String :name, :null => false, :index => true
        timestamp :ran_at, :null => false
      end
    end

    migration 'create initial tables' do
      # One row per archived page
      DATABASE.create_table :webpages do
        primary_key :id
        String :name, :size => 250, :null => false, :index => true, :unique => true
        String :uri, :size => 5000, :null => false
        String :last_modified, :size => 50
        File :last_content, :null => true
        String :last_charset, :size => 20, :null => true
        DateTime :created_at, :null => false, :index => true
      end

      # One row per archived version of a page
      DATABASE.create_table :instances do
        primary_key :id
        foreign_key :webpage_id, :webpages
        String :commit_timestamp, :size => 25, :null => false
        Boolean :snapshot, :default => false, :index => true
        DateTime :created_at, :null => false, :index => true
      end

      # Elements attached to a page: images, scripts and stylesheets
      DATABASE.create_table :images do
        primary_key :id
        foreign_key :webpage_id, :webpages
        String :uri, :size => 5000, :null => false
        String :last_modified, :size => 50, :null => false
        String :extension, :size => 10, :null => false
        String :file_hash, :size => 50, :null => false
        DateTime :last_fetched, :null => false, :index => true
      end

      DATABASE.create_table :scripts do
        primary_key :id
        foreign_key :webpage_id, :webpages
        String :uri, :size => 5000, :null => false
        String :last_modified, :size => 50, :null => false
        String :file_hash, :size => 50, :null => false
        DateTime :last_fetched, :null => false, :index => true
      end

      DATABASE.create_table :stylesheets do
        primary_key :id
        foreign_key :webpage_id, :webpages
        String :uri, :size => 5000, :null => false
        String :last_modified, :size => 50, :null => false
        String :file_hash, :size => 50, :null => false
        DateTime :last_fetched, :null => false, :index => true
        File :last_content, :null => true
        String :last_charset, :size => 20, :null => true
      end
    end

  end

  # Bring the schema up to date as soon as this file is loaded
  Migrations.new.run

end
@@ -0,0 +1,190 @@
1
+ require 'addressable/uri'
2
+ require 'grit'
3
+ require 'andand'
4
+ require 'mime/types'
5
+
6
# Load Sequel's validation_helpers plugin on every model; the validates_* calls in the models below rely on it.
+ Sequel::Model.plugin :validation_helpers
7
# Load Sequel's timestamps plugin on every model.
# NOTE(review): presumably this is what fills the created_at columns - confirm against the plugin defaults.
+ Sequel::Model.plugin :timestamps
8
+
9
+ module WebpageArchivist
10
+
11
# An archived web page. Each page owns a directory under ASSETS_PATH, named
# after its id, which holds a git repository with index.html and the page's
# elements.
class Webpage < Sequel::Model

  # Root directory of all the page repositories
  ASSETS_PATH = File.expand_path(ENV['ARCHIVIST_ASSETS_PATH'] || './archivist_assets')
  p "Archivist assets path is [#{ASSETS_PATH}]"
  Dir.mkdir_if_not_exist ASSETS_PATH

  one_to_many :instances
  one_to_many :stylesheets
  one_to_many :scripts
  one_to_many :images

  # Path of the page's html file inside its repository
  def index_path
    File.join(repository_dir, 'index.html')
  end

  # Write the page's html
  # content:: the content to write in index.html
  def save_content content
    File.open(index_path, 'w') { |f| f.write(content) }
  end

  # The directory containing the git repository, created on first access
  def repository_dir
    unless @repository_dir
      @repository_dir = File.expand_path(File.join(ASSETS_PATH, self.id.to_s))
      Dir.mkdir_if_not_exist @repository_dir
    end
    @repository_dir
  end

  # The git repository of the page: opened when it already exists,
  # initialized otherwise
  def repository
    unless @repository
      # The .git entry lives INSIDE the repository directory; interpolating
      # "#{repository_dir}.git" checked a sibling path that never exists, so
      # the repository was re-initialized on every first access.
      if File.exist? File.join(repository_dir, '.git')
        @repository = Grit::Repo.new(repository_dir)
      else
        Dir.mkdir_if_not_exist repository_dir
        @repository = Grit::Repo.init(repository_dir)
      end
    end
    @repository
  end

  # Require a name and an uri, bound their lengths to the column sizes,
  # require the name to be unique and the uri to be parseable
  def validate
    super
    validates_presence [:uri, :name]
    validates_max_length 5000, :uri
    validates_max_length 250, :name
    validates_unique :name, :message => "[#{self.name}] is already taken"
    if self.uri
      begin
        URI.parse self.uri
      rescue URI::InvalidURIError
        errors.add('uri', "[#{self.uri}] is not a valid uri")
      end
    end
  end

  # Make sure the repository exists as soon as the page is created
  def after_create
    super
    repository
  end

  # Update the repo and commit the changes
  # files:: the files that should be in the repository
  # message:: the commit message
  def update_repo_commit_changes files, message
    # Drop every entry that is neither hidden, nor index.html, nor listed
    Dir.foreach(repository_dir) do |file|
      unless file.start_with?('.') || ('index.html' == file) || files.include?(file)
        File.delete File.join(repository_dir, file)
      end
    end
    status = repository.status
    repository.add status.untracked.keys
    repository.add status.changed.keys
    repository.remove status.deleted.keys
    repository.commit_index message
  end

end
88
+
89
# One archived version of a webpage
class Instance < Sequel::Model

  many_to_one :webpage

  # An instance must reference its webpage and carry a commit timestamp
  def validate
    super
    required = [:webpage_id, :commit_timestamp]
    validates_presence(required)
  end

end
99
+
100
# Mixin for elements whose content is stored as a file in the repository of
# their webpage. The including class must provide +webpage+ and +file_name+.
module ElementWithContent

  # Write the element's content in the webpage's repository
  # content:: the content to write
  def save_content content
    target = File.join(webpage.repository_dir, file_name)
    # Binary mode: this is also used for images, whose bytes must be written
    # untouched (no newline or encoding conversion).
    File.open(target, 'wb') { |f| f.write(content) }
  end

end
107
+
108
# Behaviour shared by the elements of a webpage (images, scripts and
# stylesheets)
module WebpageElement
  include ElementWithContent

  # Require the common columns, bound the uri length and check that the uri
  # can be parsed
  def validate
    super
    validates_presence [:webpage_id, :uri, :last_fetched, :last_modified, :file_hash]
    validates_max_length 5000, :uri

    # Nothing to parse without an uri, the presence check already flagged it
    return unless self.uri

    begin
      URI.parse self.uri
    rescue URI::InvalidURIError
      errors.add('uri', "[#{self.uri}] is not a valid uri")
    end
  end

end
125
+
126
# An image used by a webpage
class Image < Sequel::Model
  many_to_one :webpage

  include WebpageElement

  # Always false for images
  def self.compress
    false
  end

  # On top of the common checks, bound the extension to the column size
  def validate
    super
    validates_max_length 10, :extension
  end

  # Name of the image's file in the repository
  def file_name
    "#{file_hash}#{extension}"
  end

  # Get the file extension (with its leading dot) for an image
  # uri:: the image uri, used when the content type gives no extension
  # content_type:: the content type of the image
  def self.extention uri, content_type
    # Take the first registered type BEFORE asking for its extensions: an
    # unknown content type yields no type at all, and calling #extensions on
    # nil raised instead of falling back to the uri.
    mime_type = content_type ? MIME::Types[content_type].first : nil
    extension = mime_type.andand.extensions.andand.first
    extension ? ".#{extension}" : File.extname(Addressable::URI.parse(uri).normalize.path)[0...10]
  end

end
150
+
151
# A script used by a webpage
class Script < Sequel::Model
  include WebpageElement

  many_to_one :webpage

  # Scripts always get the .js extension, whatever the arguments
  def self.extention(uri, content_type)
    '.js'
  end

  # Name of the script's file in the repository
  def file_name
    "#{file_hash}.js"
  end

  # Accepted so scripts can be assigned an extension like images are,
  # but the value is ignored
  def extension=(extension)
    # do nothing
  end

end
169
+
170
# A stylesheet used by a webpage
class Stylesheet < Sequel::Model
  include WebpageElement

  many_to_one :webpage

  # Stylesheets always get the .css extension, whatever the arguments
  def self.extention(uri, content_type)
    '.css'
  end

  # Name of the stylesheet's file in the repository
  def file_name
    "#{file_hash}.css"
  end

  # Accepted so stylesheets can be assigned an extension like images are,
  # but the value is ignored
  def extension=(extension)
    # do nothing
  end

end
188
+
189
+ end
190
+
@@ -0,0 +1,63 @@
1
+ # Contains monkey patches
2
+ require 'addressable/uri'
3
+ require 'grit'
4
+ require 'mini_magick'
5
+
6
module Addressable

  class URI

    # Resolve an uri against this one
    # uri:: the uri to make absolute
    # Returns the joined and normalized uri as a string
    def absolutize(uri)
      absolute = join(uri)
      absolute.normalize.to_s
    end

  end

end
18
+
19
+ # Patching the Grit::Repo class so the git commands are run from the work tree
20
class Grit::Repo

  # Run git add from the work tree
  # files:: the files to add, nested arrays are flattened
  def add(*files)
    paths = files.flatten
    git.add({:chdir => git.work_tree}, *paths)
  end

  # Run git rm from the work tree
  # files:: the files to remove, nested arrays are flattened
  def remove(*files)
    paths = files.flatten
    git.rm({:chdir => git.work_tree}, *paths)
  end

  # Run git commit from the work tree, an empty commit is allowed
  # message:: the commit message
  def commit_index(message)
    git.commit({:chdir => git.work_tree}, '--allow-empty', '-m', message)
  end

  # Run git archive from the work tree
  # id:: the tree or commit to archive
  # file:: the output file
  def archive_zip(id, file)
    output_option = "--output=#{file}"
    git.archive({:chdir => git.work_tree}, output_option, id)
  end

end
39
+
40
+ # Patching MiniMagick so we can specify the path to the ImageMagick installation
41
module MiniMagick

  class CommandBuilder

    # Keep the stock implementation reachable
    alias_method :parent_command, :command

    # The stock command, prefixed with "$IMAGE_MAGICK_PATH/" when that
    # environment variable is set
    def command
      install_dir = ENV['IMAGE_MAGICK_PATH']
      prefix = install_dir ? "#{install_dir}/" : ''
      "#{prefix}#{parent_command}"
    end

  end
end
53
+
54
class Dir

  # Create a dir if it does not exist
  # p1:: path of the directory, its parent must already exist
  # Raises Errno::EEXIST when the path exists but is not a directory
  def self.mkdir_if_not_exist p1
    Dir.mkdir p1 unless Dir.exist? p1
  rescue Errno::EEXIST
    # Something created the path between the check and the mkdir: that is
    # fine as long as what is there now is a directory.
    raise unless Dir.exist? p1
  end

end
@@ -0,0 +1,77 @@
1
+ require 'websnap'
2
+ require 'mini_magick'
3
+
4
+ module WebpageArchivist
5
+
6
+ # Snapshot the pages and create thumbnails
7
# Snapshot the pages and create thumbnails
class Snapshoter

  # Root directory of the snapshots, one sub-directory per webpage id
  SNAPSHOTS_PATH = File.expand_path(ENV['ARCHIVIST_SNAPSHOTS_PATH'] || './archivist_snapshots')
  p "Archivist snapshots path is [#{SNAPSHOTS_PATH}]"
  Dir.mkdir_if_not_exist SNAPSHOTS_PATH

  class << self
    # Snapshot settings, see the values assigned below
    attr_accessor :width, :height, :format, :thumbnail_scale, :thumbnail_crop_width, :thumbnail_crop_height
  end

  # Passed to the snapper as --width and --height
  Snapshoter.width = 1024

  Snapshoter.height = 1536

  # Passed to the snapper as --format, also used as the file extension
  Snapshoter.format = 'jpeg'

  # Size of the area cropped from the top left corner for the thumbnail
  Snapshoter.thumbnail_crop_width = 1024

  Snapshoter.thumbnail_crop_height = 768

  # Scale applied to the cropped area, in percents
  Snapshoter.thumbnail_scale = 25

  # Create a snapshot corresponding to an instance
  # instance:: the instance
  def self.snapshot_instance instance
    dir_path = File.join(SNAPSHOTS_PATH, instance.webpage.id.to_s)
    Dir.mkdir_if_not_exist dir_path

    snapshot_path = File.join(dir_path, "#{instance.id}.#{Snapshoter.format}")
    thumbnail_path = File.join(dir_path, "#{instance.id.to_s}-small.#{Snapshoter.format}")
    # Block form so the handle on index.html is closed once the snapshot is
    # done, instead of staying open until garbage collection.
    File.open(instance.webpage.index_path) do |index_file|
      snapshot index_file, snapshot_path, thumbnail_path
    end
    instance.update(:snapshot => true)
  end

  # Create a snapshot of a web page
  # uri_or_file:: the uri of the file to snapshot
  # snapshot_path:: path to the snapshot file
  # thumbnail_path: path to the thumbnail (can be nil for no thumbnail)
  def self.snapshot uri_or_file, snapshot_path, thumbnail_path = nil
    ::WebpageArchivist.debug "Snapshot for [#{uri_or_file.kind_of?(File) ? uri_or_file.path : uri_or_file}] on [#{snapshot_path}]" if ::WebpageArchivist.log

    # File.exist? rather than File.exists?: the latter was deprecated and
    # then removed in Ruby 3.2
    File.delete snapshot_path if File.exist? snapshot_path

    snapper = WebSnap::Snapper.new(uri_or_file)
    # Drop the snapper's own options and keep only ours
    snapper.options.clear.merge!({'--height' => Snapshoter.height, '--width' => Snapshoter.width, '--format' => Snapshoter.format})
    snapper.to_file(snapshot_path)

    if thumbnail_path
      ::WebpageArchivist.debug "Thumbnail of [#{snapshot_path}] on [#{thumbnail_path}]" if ::WebpageArchivist.log

      File.delete thumbnail_path if File.exist? thumbnail_path

      # Crop from the top left corner, then scale down
      img = MiniMagick::Image.open(snapshot_path)
      img.combine_options do |c|
        c.crop "#{Snapshoter.thumbnail_crop_width}x#{Snapshoter.thumbnail_crop_height}+0+0"
        c.scale "#{Snapshoter.thumbnail_scale}%"
      end
      img.write thumbnail_path
    end

  end

end
76
+
77
+ end
@@ -0,0 +1,129 @@
1
+ require 'addressable/uri'
2
+ require 'css_parser'
3
+ require 'iconv'
4
+
5
module CssParser

  # Replace original implementation with one using addressable
  # css:: the css text
  # base_uri:: the uri the url() references are resolved against, it must
  #            respond to absolutize
  # Returns the css with each url() rewritten as url('<absolute uri>')
  def self.convert_uris(css, base_uri)
    css.gsub(URI_RX) do
      uri = $1.to_s
      # Strip the quotes around (or inside) the reference
      uri.gsub!(/["']+/, '')
      "url('#{base_uri.absolutize(uri)}')"
    end
  end

  # Make the declaration accessible
  class RuleSet

    attr_reader :declarations

  end

  class Parser

    # The uris of the @import rules listed by add_block_archivist
    attr_reader :imports

    # Add the imports first
    # media_types:: forwarded to each_selector
    # Returns the @import rules followed by each selector and its declarations
    def to_s(media_types = :all)
      import_rules = imports && imports.collect { |i| "@import url(\"#{i}\");" }.join("\n")
      out = import_rules ? "#{import_rules}\n\n" : ''

      each_selector(media_types) do |selectors, declarations, _specificity|
        out << "#{selectors} {\n#{declarations}\n}\n"
      end
      out
    end

    # Adapted from add_block! so the imports are listed instead of fetched
    # block:: the css text to add
    # options:: same keys as the defaults below
    def add_block_archivist(block, options = {})
      @imports ||= []

      defaults = {:base_uri => nil, :base_dir => nil, :charset => nil, :media_types => :all, :only_media_types => :all}
      options = defaults.merge(options)
      # Both media type options may be given as a single value or a list
      options[:media_types] = [options[:media_types]].flatten
      options[:only_media_types] = [options[:only_media_types]].flatten

      block = cleanup_block(block)

      if options[:base_uri] && @options[:absolute_paths]
        block = CssParser.convert_uris(block, options[:base_uri])
      end

      # List @imported CSS
      imported = block.scan(RE_AT_IMPORT_RULE).collect do |import_rule|
        import_rule[0].to_s.gsub(/['"]*/, '').strip
      end
      imports.concat(imported)

      # The import rules are listed apart, take them out of the parsed css
      block.gsub!(RE_AT_IMPORT_RULE, '')

      parse_block_into_rule_sets!(block, options)
    end
  end
end
62
+
63
+ module WebpageArchivist
64
+
65
+ # Wrapper around css_parser
66
# Wrapper around css_parser
class StylesheetDocument

  # Applied to the content when no charset is given
  CONVERTER = Iconv.new('UTF-8//IGNORE//TRANSLIT', 'ASCII//IGNORE//TRANSLIT')

  # One css url(...) token, with or without quotes around the uri;
  # the first capture is the bare uri
  URL_TOKEN = /url\(\s*['"]?([^'")]+?)['"]?\s*\)/i

  attr_reader :parser, :charset

  # content:: the css text
  # base_uri:: the uri of the stylesheet, handed to the parser
  # charset:: the charset of the content, when nil the content goes through
  #           CONVERTER and the charset is set to 'ASCII-8BIT'
  def initialize content, base_uri, charset = nil
    @base_uri = base_uri
    @parser = CssParser::Parser.new :import => false

    @charset = charset
    unless @charset
      content = CONVERTER.iconv(content)
      @charset = 'ASCII-8BIT'
    end

    parser.add_block_archivist(content, {:base_uri => base_uri, :charset => @charset})
    @expanded = false
  end

  # Expand the background shorthand of every rule set, once
  def expand_if_needed
    unless @expanded
      parser.each_rule_set do |rs|
        rs.expand_background_shorthand!
      end
      @expanded = true
    end
  end

  # Call a block for each import
  # Block call parameter will be the import's uri
  # if the block return something it will replace the uri
  def each_import &block
    parser.imports.collect! do |uri|
      block.yield(uri) || uri
    end
  end

  # Call a block for each image
  # Block call parameter will be the image uri,
  # if the block return something it will replace the uri
  #
  # Every url(...) token of every declaration is visited, quoted or not, and
  # only the token itself is rewritten: the rest of the value is kept.
  def each_image &block
    expand_if_needed
    parser.each_rule_set do |rs|
      rs.declarations.each do |r|
        value = r[1][:value]
        next unless value.is_a?(String)

        r[1][:value] = value.gsub(URL_TOKEN) do |token|
          replacement = block.yield(Regexp.last_match(1))
          # A nil or false answer means "leave this reference as it is"
          replacement ? "url(\"#{replacement}\")" : token
        end
      end
    end
  end

  # Get the css content
  def to_css
    parser.to_s
  end

end
128
+
129
+ end