webpage-archivist 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,93 @@
1
+ require 'time'
2
+ require 'sequel'
3
+
4
+ module WebpageArchivist
5
+
6
+ DATABASE = ::Sequel.connect(ENV['DATABASE_URL'] || "sqlite://#{Dir.pwd}/webpage-archivist.sqlite3", :encoding => 'utf-8')
7
+
8
# Declares the Sequel schema migrations and applies the pending ones at
# startup (code adapted from the sinatra-sequel gem).
class Migrations

  # Registered migrations in declaration order; each entry is a hash holding
  # the :name of the migration and the :block that performs it.
  @@migrations = []

  # Register a migration.
  # name:: name stored in the migrations table once the block has run
  # block:: the code performing the schema change
  def self.migration(name, &block)
    @@migrations.push({:name => name, :block => block})
  end

  # Run every registered migration whose name is not yet recorded in the
  # migrations table. Each migration and its bookkeeping row share one
  # transaction.
  def run
    create_migrations_table
    @@migrations.each do |migration|
      already_ran = DATABASE[:migrations].filter(:name => migration[:name]).count != 0
      next if already_ran

      p "Running migration: #{migration[:name]}"
      DATABASE.transaction do
        migration[:block].call
        DATABASE[:migrations] << {:name => migration[:name], :ran_at => Time.now}
      end
    end
  end

  migration 'create initial tables' do
    DATABASE.create_table :webpages do
      primary_key :id
      String :name, :size => 250, :null => false, :index => true, :unique => true
      String :uri, :size => 5000, :null => false
      String :last_modified, :size => 50
      File :last_content, :null => true
      String :last_charset, :size => 20, :null => true
      DateTime :created_at, :null => false, :index => true
    end

    DATABASE.create_table :instances do
      primary_key :id
      foreign_key :webpage_id, :webpages
      String :commit_timestamp, :size => 25, :null => false
      Boolean :snapshot, :default => false, :index => true
      DateTime :created_at, :null => false, :index => true
    end

    DATABASE.create_table :images do
      primary_key :id
      foreign_key :webpage_id, :webpages
      String :uri, :size => 5000, :null => false
      String :last_modified, :size => 50, :null => false
      String :extension, :size => 10, :null => false
      String :file_hash, :size => 50, :null => false
      DateTime :last_fetched, :null => false, :index => true
    end

    DATABASE.create_table :scripts do
      primary_key :id
      foreign_key :webpage_id, :webpages
      String :uri, :size => 5000, :null => false
      String :last_modified, :size => 50, :null => false
      String :file_hash, :size => 50, :null => false
      DateTime :last_fetched, :null => false, :index => true
    end

    DATABASE.create_table :stylesheets do
      primary_key :id
      foreign_key :webpage_id, :webpages
      String :uri, :size => 5000, :null => false
      String :last_modified, :size => 50, :null => false
      String :file_hash, :size => 50, :null => false
      DateTime :last_fetched, :null => false, :index => true
      File :last_content, :null => true
      String :last_charset, :size => 20, :null => true
    end

  end

  # Create the bookkeeping table listing the migrations already applied,
  # unless it is already there.
  def create_migrations_table
    DATABASE.create_table? :migrations do
      primary_key :id
      String :name, :null => false, :index => true
      timestamp :ran_at, :null => false
    end
  end

end
90
+
91
+ Migrations.new.run
92
+
93
+ end
@@ -0,0 +1,190 @@
1
+ require 'addressable/uri'
2
+ require 'grit'
3
+ require 'andand'
4
+ require 'mime/types'
5
+
6
+ Sequel::Model.plugin :validation_helpers
7
+ Sequel::Model.plugin :timestamps
8
+
9
+ module WebpageArchivist
10
+
11
# A web page followed by the archivist. Every page owns a directory named
# after its id under ASSETS_PATH; that directory is a git work tree holding
# the page's index.html and the files of its elements.
class Webpage < Sequel::Model

  ASSETS_PATH = File.expand_path(ENV['ARCHIVIST_ASSETS_PATH'] || './archivist_assets')
  p "Archivist assets path is [#{ASSETS_PATH}]"
  Dir.mkdir_if_not_exist ASSETS_PATH

  one_to_many :instances
  one_to_many :stylesheets
  one_to_many :scripts
  one_to_many :images

  # Path of the page's main html file inside the repository directory
  def index_path
    File.join(repository_dir, 'index.html')
  end

  # Write the page content to the index file
  # content:: the content to write
  def save_content content
    File.open(index_path, 'w') { |f| f.write(content) }
  end

  # The directory containing the git repository (created on first access)
  def repository_dir
    unless @repository_dir
      @repository_dir = File.expand_path(File.join(ASSETS_PATH, self.id.to_s))
      Dir.mkdir_if_not_exist @repository_dir
    end
    @repository_dir
  end

  # The Grit repository of the page, opened if the work tree already holds
  # git metadata and initialised otherwise. The result is memoized.
  def repository
    unless @repository
      # The metadata is looked up in the ".git" entry INSIDE the work tree.
      # The previous check interpolated "#{repository_dir}.git", i.e. a
      # sibling path such as "assets/1.git", so it never matched and the
      # repository was re-initialised on every load.
      if File.exist? File.join(repository_dir, '.git')
        @repository = Grit::Repo.new(repository_dir)
      else
        Dir.mkdir_if_not_exist repository_dir
        @repository = Grit::Repo.init(repository_dir)
      end
    end
    @repository
  end

  # Check name and uri: both mandatory, bounded in length, the name unique
  # and the uri parseable by URI.parse.
  def validate
    super
    validates_presence [:uri, :name]
    validates_max_length 5000, :uri
    validates_max_length 250, :name
    validates_unique :name, :message => "[#{self.name}] is already taken"
    if self.uri
      begin
        URI.parse self.uri
      rescue URI::InvalidURIError
        errors.add('uri', "[#{self.uri}] is not a valid uri")
      end
    end
  end

  # Make sure the repository exists as soon as the record has been created
  def after_create
    super
    repository
  end

  # Update the repo and commit the changes
  # files:: the files that should be in the repository
  # message:: the commit message
  def update_repo_commit_changes files, message
    # Drop everything that is neither hidden (".", "..", ".git", ...), nor
    # the index file, nor one of the expected files.
    Dir.foreach(repository_dir) do |file|
      unless file.start_with?('.') || ('index.html' == file) || files.include?(file)
        File.delete File.join(repository_dir, file)
      end
    end
    status = repository.status
    repository.add status.untracked.keys
    repository.add status.changed.keys
    repository.remove status.deleted.keys
    repository.commit_index message
  end

end
88
+
89
# One archived state of a webpage, identified by the timestamp of its commit.
class Instance < Sequel::Model

  many_to_one :webpage

  # An instance needs its webpage and its commit timestamp
  def validate
    super
    required_columns = [:webpage_id, :commit_timestamp]
    validates_presence required_columns
  end

end
99
+
100
# Mixin for elements stored as a file in their webpage's repository.
# The including class must respond to +webpage+ and +file_name+.
module ElementWithContent

  # Write the content to the element's file in the repository directory
  # content:: the content to write
  def save_content content
    target_path = File.join(webpage.repository_dir, file_name)
    File.open(target_path, 'w') do |target|
      target.write(content)
    end
  end

end
107
+
108
# Shared behaviour of the elements attached to a webpage (images, scripts,
# stylesheets): content storage plus the common validations.
module WebpageElement
  include ElementWithContent

  # Check the mandatory columns, the uri length and that the uri can be
  # parsed by URI.parse.
  def validate
    super
    validates_presence [:webpage_id, :uri, :last_fetched, :last_modified, :file_hash]
    validates_max_length 5000, :uri
    return unless self.uri

    begin
      URI.parse self.uri
    rescue URI::InvalidURIError
      errors.add('uri', "[#{self.uri}] is not a valid uri")
    end
  end

end
125
+
126
# An image referenced by a webpage.
class Image < Sequel::Model
  many_to_one :webpage

  include WebpageElement

  # Images are not compressed
  def self.compress
    false
  end

  # Common element validations plus the extension length
  def validate
    super
    validates_max_length 10, :extension
  end

  # Name of the image file in the repository: the hash followed by the
  # extension (which already carries its leading dot)
  def file_name
    "#{file_hash}#{extension}"
  end

  # Find the file extension (leading dot included) of an image
  # uri:: the uri of the image, used when the content type gives nothing
  # content_type:: the content type the image was served with
  #
  # The first extension registered for the content type wins; otherwise the
  # extension of the uri's path is used, cut to 10 characters to fit the
  # column.
  def self.extention uri, content_type
    # MIME::Types[] answers with an empty list for an unknown content type,
    # so the first element has to be nil-guarded as well before asking for
    # its extensions (it used to raise NoMethodError in that case).
    mime_type = MIME::Types[content_type].andand.first
    extension = mime_type.andand.extensions.andand.first
    extension ? ".#{extension}" : File.extname(Addressable::URI.parse(uri).normalize.path)[0...10]
  end

end
150
+
151
# A javascript file referenced by a webpage.
class Script < Sequel::Model
  many_to_one :webpage

  include WebpageElement

  # Scripts always use the .js extension, whatever the uri or content type
  def self.extention uri, content_type
    '.js'
  end

  # Name of the script file in the repository
  def file_name
    file_hash.to_s + '.js'
  end

  # The extension is fixed, so assigning one is silently ignored
  def extension= extension
  end

end
169
+
170
# A css file referenced by a webpage.
class Stylesheet < Sequel::Model
  many_to_one :webpage

  include WebpageElement

  # Stylesheets always use the .css extension, whatever the uri or content type
  def self.extention uri, content_type
    '.css'
  end

  # Name of the stylesheet file in the repository
  def file_name
    file_hash.to_s + '.css'
  end

  # The extension is fixed, so assigning one is silently ignored
  def extension= extension
  end

end
188
+
189
+ end
190
+
@@ -0,0 +1,63 @@
1
+ # Contains monkey patches
2
+ require 'addressable/uri'
3
+ require 'grit'
4
+ require 'mini_magick'
5
+
6
module Addressable

  class URI

    # Make a URI absolute, using the receiver as the base
    # uri:: the (possibly relative) uri to resolve
    # Returns the joined and normalized uri as a string
    def absolutize uri
      resolved = join(uri)
      resolved.normalize.to_s
    end

  end

end
18
+
19
# Patching the grit repo class so the git commands run from the work tree
class Grit::Repo

  # Stage the given files (nested lists are flattened)
  def add(*files)
    paths = files.flatten
    git.add({:chdir => git.work_tree}, *paths)
  end

  # Remove the given files from the index (nested lists are flattened)
  def remove(*files)
    paths = files.flatten
    git.rm({:chdir => git.work_tree}, *paths)
  end

  # Commit the index, even when nothing changed
  # message:: the commit message
  def commit_index(message)
    git.commit({:chdir => git.work_tree}, '--allow-empty', '-m', message)
  end

  # Archive a tree to a file
  # id:: the tree-ish to archive
  # file:: path of the output file
  def archive_zip(id, file)
    output_option = "--output=#{file}"
    git.archive({:chdir => git.work_tree}, output_option, id)
  end

end
39
+
40
# Patching MiniMagick so the path to the ImageMagick installation can be
# given through the IMAGE_MAGICK_PATH environment variable
module MiniMagick

  class CommandBuilder

    alias :parent_command :command

    # The original command, prefixed with "<IMAGE_MAGICK_PATH>/" when the
    # variable is set
    def command
      install_path = ENV['IMAGE_MAGICK_PATH']
      prefix = install_path ? "#{install_path}/" : ''
      "#{prefix}#{parent_command}"
    end

  end
end
53
+
54
class Dir

  # Create a directory unless it already exists
  # p1:: path of the directory
  # Returns the result of Dir.mkdir when the directory was created, nil otherwise
  def self.mkdir_if_not_exist p1
    return nil if Dir.exist?(p1)

    Dir.mkdir(p1)
  end

end
@@ -0,0 +1,77 @@
1
+ require 'websnap'
2
+ require 'mini_magick'
3
+
4
+ module WebpageArchivist
5
+
6
# Snapshot the pages and create thumbnails.
# The snapshots are stored under SNAPSHOTS_PATH, one directory per webpage.
class Snapshoter

  SNAPSHOTS_PATH = File.expand_path(ENV['ARCHIVIST_SNAPSHOTS_PATH'] || './archivist_snapshots')
  p "Archivist snapshots path is [#{SNAPSHOTS_PATH}]"
  Dir.mkdir_if_not_exist SNAPSHOTS_PATH

  class << self
    # Snapshot size and format, and the crop box / scale (in percent) used
    # to derive the thumbnail from the snapshot
    attr_accessor :width, :height, :format, :thumbnail_scale, :thumbnail_crop_width, :thumbnail_crop_height
  end

  Snapshoter.width = 1024

  Snapshoter.height = 1536

  Snapshoter.format = 'jpeg'

  Snapshoter.thumbnail_crop_width = 1024

  Snapshoter.thumbnail_crop_height = 768

  Snapshoter.thumbnail_scale = 25

  # Create a snapshot corresponding to an instance
  # instance:: the instance
  # The snapshot and its thumbnail go to "<id>.<format>" and
  # "<id>-small.<format>" in the webpage's directory; the instance is then
  # flagged as having a snapshot.
  def self.snapshot_instance instance
    dir_path = File.join(SNAPSHOTS_PATH, instance.webpage.id.to_s)
    Dir.mkdir_if_not_exist dir_path

    snapshot_path = File.join(dir_path, "#{instance.id}.#{Snapshoter.format}")
    thumbnail_path = File.join(dir_path, "#{instance.id.to_s}-small.#{Snapshoter.format}")
    # Block form so the index file handle is closed once the snapshot is
    # done (File.new left it open until garbage collection).
    File.open(instance.webpage.index_path) do |index_file|
      snapshot index_file, snapshot_path, thumbnail_path
    end
    instance.update(:snapshot => true)
  end

  # Create a snapshot of a web page
  # uri_or_file:: the uri of the file to snapshot
  # snapshot_path:: path to the snapshot file
  # thumbnail_path: path to the thumbnail (can be nil for no thumbnail)
  # Existing snapshot and thumbnail files are deleted before being written.
  def self.snapshot uri_or_file, snapshot_path, thumbnail_path = nil
    ::WebpageArchivist.debug "Snapshot for [#{uri_or_file.kind_of?(File) ? uri_or_file.path : uri_or_file}] on [#{snapshot_path}]" if ::WebpageArchivist.log

    # File.exist? rather than the deprecated File.exists? alias
    File.delete snapshot_path if File.exist? snapshot_path

    snapper = WebSnap::Snapper.new(uri_or_file)
    # Replace whatever options the snapper starts with by the configured ones
    snapper.options.clear.merge!({'--height' => Snapshoter.height, '--width' => Snapshoter.width, '--format' => Snapshoter.format})
    snapper.to_file(snapshot_path)

    if thumbnail_path
      ::WebpageArchivist.debug "Thumbnail of [#{snapshot_path}] on [#{thumbnail_path}]" if ::WebpageArchivist.log

      File.delete thumbnail_path if File.exist? thumbnail_path

      # Crop the top-left corner of the snapshot, then scale it down
      img = MiniMagick::Image::open(snapshot_path)
      img.combine_options do |c|
        c.crop "#{Snapshoter.thumbnail_crop_width}x#{Snapshoter.thumbnail_crop_height}+0+0"
        c.scale "#{Snapshoter.thumbnail_scale}%"
      end
      img.write thumbnail_path
    end

  end

end
76
+
77
+ end
@@ -0,0 +1,129 @@
1
+ require 'addressable/uri'
2
+ require 'css_parser'
3
+ require 'iconv'
4
+
5
module CssParser

  # Replace original implementation with one using addressable
  # css:: the css text
  # base_uri:: object answering to +absolutize+, used to resolve each url
  # Returns the css with every url matched by URI_RX rewritten as an
  # absolute, single-quoted url()
  def self.convert_uris(css, base_uri)
    css.gsub(URI_RX) do
      unquoted = $1.to_s.gsub(/["']+/, '')
      "url('#{base_uri.absolutize(unquoted)}')"
    end
  end

  # Make the declaration accessible
  class RuleSet

    attr_reader :declarations

  end

  class Parser

    attr_reader :imports

    # Render the stylesheet, adding the imports first
    def to_s(media_types = :all)
      out =
        if imports
          import_rules = imports.collect { |i| "@import url(\"#{i}\");" }
          "#{import_rules.join("\n")}\n\n"
        else
          ''
        end

      each_selector(media_types) do |selectors, declarations, specificity|
        out << "#{selectors} {\n#{declarations}\n}\n"
      end
      out
    end

    # Adapted from add_block! so the imports are listed instead of fetched
    # block:: the css text to add
    # options:: same keys as the defaults below
    def add_block_archivist(block, options = {})
      @imports ||= []

      defaults = {:base_uri => nil, :base_dir => nil, :charset => nil, :media_types => :all, :only_media_types => :all}
      options = defaults.merge(options)
      [:media_types, :only_media_types].each do |key|
        options[key] = [options[key]].flatten
      end

      block = cleanup_block(block)

      if options[:base_uri] && @options[:absolute_paths]
        block = CssParser.convert_uris(block, options[:base_uri])
      end

      # List @imported CSS, without their quotes
      block.scan(RE_AT_IMPORT_RULE).each do |import_rule|
        import_uri = import_rule[0].to_s.gsub(/['"]*/, '').strip
        imports << import_uri
      end

      # The import rules are kept apart, so they are dropped from the block
      block.gsub!(RE_AT_IMPORT_RULE, '')

      parse_block_into_rule_sets!(block, options)
    end
  end
end
62
+
63
+ module WebpageArchivist
64
+
65
+ # Wrapper around css_parser
66
+ class StylesheetDocument
67
+
68
+ CONVERTER = Iconv.new('UTF-8//IGNORE//TRANSLIT', 'ASCII//IGNORE//TRANSLIT')
69
+
70
+ attr_reader :parser, :charset
71
+
72
# Parse a stylesheet
# content:: the css text
# base_uri:: the uri of the stylesheet, handed to the parser as :base_uri
# charset:: the charset of the content; when missing the content goes
#           through CONVERTER and the charset is recorded as 'ASCII-8BIT'
def initialize content, base_uri, charset = nil
  @base_uri = base_uri
  @parser = CssParser::Parser.new :import => false

  if charset
    @charset = charset
  else
    content = CONVERTER.iconv(content)
    @charset = 'ASCII-8BIT'
  end

  block_options = {:base_uri => base_uri, :charset => @charset}
  parser.add_block_archivist(content, block_options)
  @expanded = false
end
85
+
86
# Expand the background shorthand of every rule set, the first time only
def expand_if_needed
  return if @expanded

  parser.each_rule_set { |rule_set| rule_set.expand_background_shorthand! }
  @expanded = true
end
94
+
95
# Call a block for each import.
# The block receives the import's uri; when it returns something, that
# value replaces the uri in the parser's import list.
def each_import &block
  parser.imports.map! do |import_uri|
    replacement = block.call(import_uri)
    replacement || import_uri
  end
end
103
+
104
# Call a block for each image.
# The block receives the uri of every url(...) found in the declarations,
# whether it is single-quoted, double-quoted or unquoted; when the block
# returns something, that value replaces the uri. Only the url(...) token is
# rewritten (as url("...")), the rest of the declaration value is kept.
def each_image &block
  expand_if_needed
  parser.each_rule_set do |rs|
    rs.declarations.each do |r|
      value = r[1][:value]
      next unless value

      # The closing quote must be the same as the opening one (\1), so a
      # match can not run across several url() tokens of the same value.
      rewritten = value.gsub(/url\(\s*(['"]?)([^'")]+)\1\s*\)/i) do
        # Read the match data before yielding
        original_token = Regexp.last_match(0)
        uri = Regexp.last_match(2).strip
        replacement = block.yield(uri)
        replacement ? "url(\"#{replacement}\")" : original_token
      end
      r[1][:value] = rewritten unless rewritten == value
    end
  end
end
121
+
122
# Get the css content: the parser rendered as a string by its to_s
def to_css
  parser.to_s
end
126
+
127
+ end
128
+
129
+ end