pedophile 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a88a4801a4b0f5e66827ee308edee32de2f9882b
4
+ data.tar.gz: 31a68e039b567c0085d810000e959071322ec6ad
5
+ SHA512:
6
+ metadata.gz: cdcc88116835bc2678a3f8f4c28cac9767c2037bf2cf9ba7e4e83eda05c128b90f08920d90a7f2de1f53185aa8a0fa081b0b8ecf881f6d985347c66bed377c73
7
+ data.tar.gz: eb284fdaedd82df541ca2248280eb030778467f5339f068f41d942c6f381e3ce73bff2b2e59a77c1e1ed18dd26edb5adcc9c2e39865930615f5d9908e79dfd20
data/Gemfile ADDED
@@ -0,0 +1,11 @@
# Fetch gems over TLS so packages cannot be tampered with in transit.
source "https://rubygems.org"

# Runtime dependencies: activesupport supplies Hash#to_query (used when
# posting forms through wget), colorize supplies the coloured console output.
gem 'activesupport'
gem 'colorize'

# Tooling needed only to test, measure coverage and release the gem.
group :development do
  gem "rspec"
  gem "bundler"
  gem "jeweler"
  gem "simplecov"
end
data/Gemfile.lock ADDED
@@ -0,0 +1,90 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ activesupport (4.2.0)
5
+ i18n (~> 0.7)
6
+ json (~> 1.7, >= 1.7.7)
7
+ minitest (~> 5.1)
8
+ thread_safe (~> 0.3, >= 0.3.4)
9
+ tzinfo (~> 1.1)
10
+ addressable (2.3.7)
11
+ builder (3.2.2)
12
+ colorize (0.7.5)
13
+ descendants_tracker (0.0.4)
14
+ thread_safe (~> 0.3, >= 0.3.1)
15
+ diff-lcs (1.2.5)
16
+ docile (1.1.5)
17
+ faraday (0.9.1)
18
+ multipart-post (>= 1.2, < 3)
19
+ git (1.2.9.1)
20
+ github_api (0.12.3)
21
+ addressable (~> 2.3)
22
+ descendants_tracker (~> 0.0.4)
23
+ faraday (~> 0.8, < 0.10)
24
+ hashie (>= 3.3)
25
+ multi_json (>= 1.7.5, < 2.0)
26
+ nokogiri (~> 1.6.3)
27
+ oauth2
28
+ hashie (3.4.0)
29
+ highline (1.7.1)
30
+ i18n (0.7.0)
31
+ jeweler (2.0.1)
32
+ builder
33
+ bundler (>= 1.0)
34
+ git (>= 1.2.5)
35
+ github_api
36
+ highline (>= 1.6.15)
37
+ nokogiri (>= 1.5.10)
38
+ rake
39
+ rdoc
40
+ json (1.8.2)
41
+ jwt (1.3.0)
42
+ mini_portile (0.6.2)
43
+ minitest (5.5.1)
44
+ multi_json (1.10.1)
45
+ multi_xml (0.5.5)
46
+ multipart-post (2.0.0)
47
+ nokogiri (1.6.6.2)
48
+ mini_portile (~> 0.6.0)
49
+ oauth2 (1.0.0)
50
+ faraday (>= 0.8, < 0.10)
51
+ jwt (~> 1.0)
52
+ multi_json (~> 1.3)
53
+ multi_xml (~> 0.5)
54
+ rack (~> 1.2)
55
+ rack (1.6.0)
56
+ rake (10.4.2)
57
+ rdoc (4.2.0)
58
+ json (~> 1.4)
59
+ rspec (3.2.0)
60
+ rspec-core (~> 3.2.0)
61
+ rspec-expectations (~> 3.2.0)
62
+ rspec-mocks (~> 3.2.0)
63
+ rspec-core (3.2.1)
64
+ rspec-support (~> 3.2.0)
65
+ rspec-expectations (3.2.0)
66
+ diff-lcs (>= 1.2.0, < 2.0)
67
+ rspec-support (~> 3.2.0)
68
+ rspec-mocks (3.2.1)
69
+ diff-lcs (>= 1.2.0, < 2.0)
70
+ rspec-support (~> 3.2.0)
71
+ rspec-support (3.2.2)
72
+ simplecov (0.9.2)
73
+ docile (~> 1.1.0)
74
+ multi_json (~> 1.0)
75
+ simplecov-html (~> 0.9.0)
76
+ simplecov-html (0.9.0)
77
+ thread_safe (0.3.4)
78
+ tzinfo (1.2.2)
79
+ thread_safe (~> 0.1)
80
+
81
+ PLATFORMS
82
+ ruby
83
+
84
+ DEPENDENCIES
85
+ activesupport
86
+ bundler
87
+ colorize
88
+ jeweler
89
+ rspec
90
+ simplecov
data/LICENSE.txt ADDED
@@ -0,0 +1,165 @@
1
+ GNU LESSER GENERAL PUBLIC LICENSE
2
+ Version 3, 29 June 2007
3
+
4
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
5
+ Everyone is permitted to copy and distribute verbatim copies
6
+ of this license document, but changing it is not allowed.
7
+
8
+
9
+ This version of the GNU Lesser General Public License incorporates
10
+ the terms and conditions of version 3 of the GNU General Public
11
+ License, supplemented by the additional permissions listed below.
12
+
13
+ 0. Additional Definitions.
14
+
15
+ As used herein, "this License" refers to version 3 of the GNU Lesser
16
+ General Public License, and the "GNU GPL" refers to version 3 of the GNU
17
+ General Public License.
18
+
19
+ "The Library" refers to a covered work governed by this License,
20
+ other than an Application or a Combined Work as defined below.
21
+
22
+ An "Application" is any work that makes use of an interface provided
23
+ by the Library, but which is not otherwise based on the Library.
24
+ Defining a subclass of a class defined by the Library is deemed a mode
25
+ of using an interface provided by the Library.
26
+
27
+ A "Combined Work" is a work produced by combining or linking an
28
+ Application with the Library. The particular version of the Library
29
+ with which the Combined Work was made is also called the "Linked
30
+ Version".
31
+
32
+ The "Minimal Corresponding Source" for a Combined Work means the
33
+ Corresponding Source for the Combined Work, excluding any source code
34
+ for portions of the Combined Work that, considered in isolation, are
35
+ based on the Application, and not on the Linked Version.
36
+
37
+ The "Corresponding Application Code" for a Combined Work means the
38
+ object code and/or source code for the Application, including any data
39
+ and utility programs needed for reproducing the Combined Work from the
40
+ Application, but excluding the System Libraries of the Combined Work.
41
+
42
+ 1. Exception to Section 3 of the GNU GPL.
43
+
44
+ You may convey a covered work under sections 3 and 4 of this License
45
+ without being bound by section 3 of the GNU GPL.
46
+
47
+ 2. Conveying Modified Versions.
48
+
49
+ If you modify a copy of the Library, and, in your modifications, a
50
+ facility refers to a function or data to be supplied by an Application
51
+ that uses the facility (other than as an argument passed when the
52
+ facility is invoked), then you may convey a copy of the modified
53
+ version:
54
+
55
+ a) under this License, provided that you make a good faith effort to
56
+ ensure that, in the event an Application does not supply the
57
+ function or data, the facility still operates, and performs
58
+ whatever part of its purpose remains meaningful, or
59
+
60
+ b) under the GNU GPL, with none of the additional permissions of
61
+ this License applicable to that copy.
62
+
63
+ 3. Object Code Incorporating Material from Library Header Files.
64
+
65
+ The object code form of an Application may incorporate material from
66
+ a header file that is part of the Library. You may convey such object
67
+ code under terms of your choice, provided that, if the incorporated
68
+ material is not limited to numerical parameters, data structure
69
+ layouts and accessors, or small macros, inline functions and templates
70
+ (ten or fewer lines in length), you do both of the following:
71
+
72
+ a) Give prominent notice with each copy of the object code that the
73
+ Library is used in it and that the Library and its use are
74
+ covered by this License.
75
+
76
+ b) Accompany the object code with a copy of the GNU GPL and this license
77
+ document.
78
+
79
+ 4. Combined Works.
80
+
81
+ You may convey a Combined Work under terms of your choice that,
82
+ taken together, effectively do not restrict modification of the
83
+ portions of the Library contained in the Combined Work and reverse
84
+ engineering for debugging such modifications, if you also do each of
85
+ the following:
86
+
87
+ a) Give prominent notice with each copy of the Combined Work that
88
+ the Library is used in it and that the Library and its use are
89
+ covered by this License.
90
+
91
+ b) Accompany the Combined Work with a copy of the GNU GPL and this license
92
+ document.
93
+
94
+ c) For a Combined Work that displays copyright notices during
95
+ execution, include the copyright notice for the Library among
96
+ these notices, as well as a reference directing the user to the
97
+ copies of the GNU GPL and this license document.
98
+
99
+ d) Do one of the following:
100
+
101
+ 0) Convey the Minimal Corresponding Source under the terms of this
102
+ License, and the Corresponding Application Code in a form
103
+ suitable for, and under terms that permit, the user to
104
+ recombine or relink the Application with a modified version of
105
+ the Linked Version to produce a modified Combined Work, in the
106
+ manner specified by section 6 of the GNU GPL for conveying
107
+ Corresponding Source.
108
+
109
+ 1) Use a suitable shared library mechanism for linking with the
110
+ Library. A suitable mechanism is one that (a) uses at run time
111
+ a copy of the Library already present on the user's computer
112
+ system, and (b) will operate properly with a modified version
113
+ of the Library that is interface-compatible with the Linked
114
+ Version.
115
+
116
+ e) Provide Installation Information, but only if you would otherwise
117
+ be required to provide such information under section 6 of the
118
+ GNU GPL, and only to the extent that such information is
119
+ necessary to install and execute a modified version of the
120
+ Combined Work produced by recombining or relinking the
121
+ Application with a modified version of the Linked Version. (If
122
+ you use option 4d0, the Installation Information must accompany
123
+ the Minimal Corresponding Source and Corresponding Application
124
+ Code. If you use option 4d1, you must provide the Installation
125
+ Information in the manner specified by section 6 of the GNU GPL
126
+ for conveying Corresponding Source.)
127
+
128
+ 5. Combined Libraries.
129
+
130
+ You may place library facilities that are a work based on the
131
+ Library side by side in a single library together with other library
132
+ facilities that are not Applications and are not covered by this
133
+ License, and convey such a combined library under terms of your
134
+ choice, if you do both of the following:
135
+
136
+ a) Accompany the combined library with a copy of the same work based
137
+ on the Library, uncombined with any other library facilities,
138
+ conveyed under the terms of this License.
139
+
140
+ b) Give prominent notice with the combined library that part of it
141
+ is a work based on the Library, and explaining where to find the
142
+ accompanying uncombined form of the same work.
143
+
144
+ 6. Revised Versions of the GNU Lesser General Public License.
145
+
146
+ The Free Software Foundation may publish revised and/or new versions
147
+ of the GNU Lesser General Public License from time to time. Such new
148
+ versions will be similar in spirit to the present version, but may
149
+ differ in detail to address new problems or concerns.
150
+
151
+ Each version is given a distinguishing version number. If the
152
+ Library as you received it specifies that a certain numbered version
153
+ of the GNU Lesser General Public License "or any later version"
154
+ applies to it, you have the option of following the terms and
155
+ conditions either of that published version or of any later version
156
+ published by the Free Software Foundation. If the Library as you
157
+ received it does not specify a version number of the GNU Lesser
158
+ General Public License, you may choose any version of the GNU Lesser
159
+ General Public License ever published by the Free Software Foundation.
160
+
161
+ If the Library as you received it specifies that a proxy can decide
162
+ whether future versions of the GNU Lesser General Public License shall
163
+ apply, that proxy's public statement of acceptance of any version is
164
+ permanent authorization for you to choose that version for the
165
+ Library.
data/README.md ADDED
@@ -0,0 +1,25 @@
1
+ Pedophile
2
+ =========
3
+
4
+ Download static web pages.
5
+
6
+ Sample usage
7
+ ------------
8
+
9
+ <pre><code>
10
+ p = Pedophile::Downloader.new
11
+
12
+ p.url = "http://www.classnamer.com/"
13
+
14
+ # clear tmp directory
15
+ p.wget.clear!
16
+
17
+ # sign in using devise like form
18
+ #p.login.devise_login("http://www.classnamer.com/login", "email@email.com", "password")
19
+
20
+ # download, process
21
+ p.make_it_so
22
+
23
+ # zip into single file in tmp/site/site.zip
24
+ p.zip("site.zip")
25
+ </code></pre>
data/Rakefile ADDED
@@ -0,0 +1,51 @@
# encoding: utf-8

# Build, release and test tasks for the gem.

require 'rubygems'
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  # Missing gems: report and exit with Bundler's own status code.
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "pedophile"
  gem.homepage = "http://github.com/akwiatkowski/pedophile"
  gem.license = "LGPLv3"
  gem.summary = %Q{download static pages for offline usage}
  gem.description = %Q{download static pages for offline usage.}
  gem.email = "bobikx@poczta.fm"
  gem.authors = ["Aleksander Kwiatkowski"]
  # dependencies defined in Gemfile

  gem.files = FileList[
    "[A-Z]*", "{bin,generators,lib,test}/**/*"
  ]
end
Jeweler::RubygemsDotOrgTasks.new

require 'rspec/core'
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new(:spec) do |spec|
  spec.pattern = FileList['spec/**/*_spec.rb']
end

task :default => :spec

require 'rdoc/task'

desc "Run RSpec with code coverage"
task :coverage do
  # The spec task spawns a child process that inherits the environment, so
  # setting COVERAGE here has the same effect as `rake spec COVERAGE=true`
  # while keeping the spec output and exit status visible (the previous
  # backtick call discarded both).
  ENV['COVERAGE'] = 'true'
  Rake::Task[:spec].invoke
  #`open coverage/index.html`
end

# RSpec 3 (the version locked in Gemfile.lock) no longer has RakeTask#rcov,
# so the old rcov task failed as soon as it ran. The task name is kept as an
# alias of :coverage.
desc "Run RSpec with code coverage (alias of coverage)"
task :rcov => :coverage
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.0
data/lib/pedophile.rb ADDED
@@ -0,0 +1,11 @@
1
+ require 'colorize'
2
+
3
+ require 'pedophile/login'
4
+ require 'pedophile/wget'
5
+ require 'pedophile/offline_tree'
6
+ require 'pedophile/big_files'
7
+
8
+ require 'pedophile/downloader'
9
+
10
+ module Pedophile
11
+ end
@@ -0,0 +1,107 @@
1
+ require 'fileutils'
2
+ require 'yaml'
3
+
module Pedophile
  # Handles a folder of "big files" that is copied next to the mirrored site
  # and whose links inside the mirrored pages are rewritten to point at the
  # copied location.
  class BigFiles
    # When true, #analyze also records the MIME type reported by file(1).
    USE_MIME = false
    # YAML dump of the analysed file list.
    TMP_STRUCTURE_PATH = File.absolute_path(File.join(Wget::TMP_PATH, "big_files.yaml"))
    # Extracts "type/subtype" from `file --mime` output; also accepts
    # subtypes such as "x-empty" or "svg+xml".
    MIME_REGEXP = /([\w.+-]+\/[\w.+-]+);/

    # downloader - the owning Pedophile::Downloader.
    def initialize(downloader)
      @downloader = downloader
      @files = Array.new
    end

    attr_reader :downloader, :full_path, :files, :files_path

    # Directory of the mirrored site, as reported by the wget wrapper.
    def offline_path
      self.downloader.wget.offline_path
    end

    # Copies the folder +path+ into the mirrored site and remembers where it
    # landed. Returns +path+.
    def copy_folder(path)
      puts "copying big files path #{path.to_s.cyan}"
      FileUtils.cp_r(path, offline_path)
      puts "done copying path #{path.to_s.cyan}"
      # The setter must be called on self: a bare `big_files_path = path`
      # only creates a local variable and leaves full_path unset.
      # cp_r places the folder under offline_path by its base name.
      self.big_files_path = File.basename(path.to_s)
      path
    end

    # Sets the big-files folder, relative to the mirrored site root.
    def big_files_path=(path)
      @files_path = path
      @full_path = File.join(offline_path, path)
    end

    # Lists every regular file below full_path into @files and stores the
    # list as YAML.
    #
    # Raises RuntimeError when no big-files folder has been set; without the
    # guard the glob would start at the filesystem root.
    def analyze
      raise "big files path is not set" if full_path.nil?

      glob_path = "#{full_path}/**/**"
      puts "big files path #{full_path.to_s.cyan}"

      Dir.glob(glob_path) do |item|
        next if File.directory?(item)

        puts "analyze file #{item.to_s.yellow}"

        h = Hash.new
        h[:path] = item
        h[:mime] = detect_mime(item) if USE_MIME

        @files << h
      end

      save_analyzed
    end

    # Writes @files to TMP_STRUCTURE_PATH as YAML.
    def save_analyzed
      File.open(TMP_STRUCTURE_PATH, "w") do |f|
        f.puts @files.to_yaml
      end
    end

    # Reloads @files from TMP_STRUCTURE_PATH.
    def load_analyzed
      @files = load_yaml_file(TMP_STRUCTURE_PATH)
    end

    # Rewrites links to every analysed big file and marks the entry as done.
    def gsub_links
      files.each do |f|
        smaller_path = f[:path].gsub(full_path, "")
        smaller_path.gsub!(/^\//, '')

        gsub_big_file(smaller_path)
        f[:done] = true
      end
    end

    # Replaces every link found in the mirrored pages that contains
    # +smaller_path+ with the path of the copied big file.
    def gsub_big_file(smaller_path)
      puts "process big file #{smaller_path.to_s.green}"

      self.downloader.offline_tree.files.each do |f|
        next unless f[:inside]

        to_rename = f[:inside].select do |fi|
          fi[:path].index(smaller_path)
        end

        # TODO gsub path issue with html files inside
        to_rename.each do |fi|
          # Work on a copy: process_massive_gsub edits the stored link
          # strings in place, which would otherwise change the search
          # pattern halfway through the run.
          original_string = fi[:path].dup
          new_string = File.join(files_path, smaller_path)

          puts "rename big file #{original_string.to_s.blue} to #{new_string.to_s.green}"

          self.downloader.offline_tree.process_massive_gsub(original_string, new_string, true)
        end
      end
    end

    private

    # Returns the MIME type reported by file(1) for +file+, or nil when the
    # output cannot be parsed. The command is run without a shell so that
    # spaces or shell metacharacters in the name are harmless.
    def detect_mime(file)
      output = IO.popen(["file", "--brief", "--mime", "--", file]) { |io| io.read }
      output.to_s[MIME_REGEXP, 1]
    end

    # Loads a YAML file written by this tool. The data contains Symbol keys,
    # which the default loader of Psych 4 rejects, so the unrestricted loader
    # is used when it exists. Only use this on files the tool wrote itself.
    def load_yaml_file(file)
      if YAML.respond_to?(:unsafe_load_file)
        YAML.unsafe_load_file(file)
      else
        YAML.load_file(file)
      end
    end

  end
end
@@ -0,0 +1,25 @@
1
+ require 'active_support/all'
2
+
module Pedophile
  # Entry point of the library: owns one instance of every helper and
  # exposes the two high level operations (mirror + process, and zip).
  class Downloader
    # Address of the site to mirror.
    attr_accessor :url

    # Helper objects, each created with a back-reference to this downloader.
    attr_reader :login, :wget, :offline_tree, :big_files

    def initialize
      @login        = Login.new(self)
      @wget         = Wget.new(self)
      @offline_tree = OfflineTree.new(self)
      @big_files    = BigFiles.new(self)
    end

    # Mirrors the site with wget, then lets the offline tree post-process
    # the downloaded files. Returns whatever the offline tree returns.
    def make_it_so
      self.wget.mirror
      self.offline_tree.make_it_so
    end

    # Packs the mirrored site into the archive +name+.
    def zip(name = "site.zip")
      self.offline_tree.zip(name)
    end

  end
end
@@ -0,0 +1,54 @@
1
+ require 'uri'
2
+ require 'colorize'
3
+
module Pedophile
  # Signs in to a site that uses a Devise-like login form, so that the
  # session cookies kept by the wget wrapper are authenticated.
  class Login
    # Hidden CSRF token field as rendered by the login form.
    TOKEN_REGEXP = /<input name=\"authenticity_token\" type=\"hidden\" value=\"([^"]+)\" \/>/
    # First form action found in the page.
    ACTION_REGEXP = /action=\"([^"]+)\"/

    attr_reader :downloader

    # downloader - the owning Pedophile::Downloader.
    def initialize(downloader)
      @downloader = downloader
    end

    # Fetches the login page at +url+, extracts the authenticity token and
    # posts the credentials.
    #
    # url      - address of the login page (must be a parseable URI).
    # email    - value for the user[email] field.
    # password - value for the user[password] field.
    #
    # Returns the body of the response to the POST.
    # Raises URI::InvalidURIError when +url+ cannot be parsed.
    def devise_login(url, email, password)
      URI.parse(url) # validates the address before any request is made

      page = @downloader.wget.download(url)

      token = page[TOKEN_REGEXP, 1]
      puts "got devise token #{token.to_s.blue}" if token

      action_url = page[ACTION_REGEXP, 1]
      puts "got action url #{action_url.to_s.blue}" if action_url

      sign_url = resolve_action_url(url, action_url)
      puts "sign action url #{sign_url.to_s.blue}"

      # Flat, bracketed keys are what the form submits; the nested hash that
      # used to be built first was overwritten before use and is gone.
      post_params = {
        "utf8" => "✓",
        "authenticity_token" => token,
        "user[email]" => email,
        "user[password]" => password,
        "user[remember_me]" => 1
      }

      # NOTE(review): the request still goes to +url+, as before, and not to
      # sign_url. The first action= in the page is not guaranteed to belong
      # to the login form - confirm before switching.
      @downloader.wget.post(url, post_params)
    end

    private

    # Resolves the form action against the page address, keeping scheme and
    # port. Falls back to the page address when no usable action was found.
    def resolve_action_url(page_url, action_url)
      return page_url if action_url.nil? || action_url.empty?

      URI.join(page_url, action_url).to_s
    rescue URI::Error
      page_url
    end

  end
end
@@ -0,0 +1,307 @@
1
+ require 'yaml'
2
+ require 'pathname'
3
+
require 'fileutils'

module Pedophile
  # Inspects the files mirrored by wget, renames files whose names carry
  # query-string leftovers or unusual characters, and rewrites the links
  # inside text files to match.
  class OfflineTree
    # YAML dump of the analysed file list.
    TMP_STRUCTURE_PATH = File.absolute_path(File.join(Wget::TMP_PATH, "files.yaml"))
    # YAML log of every rename and substitution performed.
    TMP_CHANGES_PATH = File.absolute_path(File.join(Wget::TMP_PATH, "changes.yaml"))
    # When true, replacement paths are made relative to the edited file.
    FIX_RELATIVE_PATH = false
    # Extracts "type/subtype" from `file --mime` output; also accepts
    # subtypes such as "x-empty" or "svg+xml".
    MIME_REGEXP = /([\w.+-]+\/[\w.+-]+);/

    # downloader - the owning Pedophile::Downloader.
    def initialize(downloader)
      @downloader = downloader
      @files = Array.new
      @changes = Array.new
    end

    attr_reader :downloader, :files

    # Full pipeline: analyse the mirror, fix file names and links, then
    # store the resulting structure and the change log.
    def make_it_so
      analyze
      load_analyzed

      process_bad_suffix1
      process_bad_suffix2
      process_bad_filenames
      save_analyzed
      save_changes
    end

    # Zips the mirrored site directory into +output_file+, created inside
    # Wget::TMP_OFFLINE_PATH. Returns the output of the zip command.
    def zip(output_file = 'site.zip')
      site = self.downloader.wget.site_last_path
      command = "cd #{Wget::TMP_OFFLINE_PATH}; zip -r #{output_file} #{site}"
      puts command
      # Run without a shell so unusual characters in the names are passed
      # to zip untouched.
      IO.popen(["zip", "-r", output_file.to_s, site.to_s], :chdir => Wget::TMP_OFFLINE_PATH) { |io| io.read }
    end

    # Destructive part
    # NOTE(review): load_processed, remove_bad_suffix and rename_files are
    # not defined in this class, so calling this method raises NameError
    # unless they are provided elsewhere - confirm.
    def after_process
      load_processed
      remove_bad_suffix
      rename_files
    end

    # Directory of the mirrored site (memoized).
    def path
      @path ||= self.downloader.wget.offline_path
      @path
    end

    # Records every regular file of the mirror in @files with its MIME type;
    # HTML and plain-text files also get the list of quoted strings found
    # inside them. The result is saved as YAML.
    def analyze
      # because I don't want to read all wget options...
      glob_path = "#{path}/**/**"
      puts "offline path #{path.to_s.cyan}"

      Dir.glob(glob_path) do |item|
        next if File.directory?(item)

        puts "analyze file #{item.to_s.yellow}"

        mime = detect_mime(item)

        h = Hash.new
        h[:path] = item
        h[:mime] = mime
        h[:inside] = analyze_file(item) if mime == 'text/html' or mime == 'text/plain'

        @files << h
      end

      save_analyzed
    end

    # Writes @files to TMP_STRUCTURE_PATH as YAML.
    def save_analyzed
      File.open(TMP_STRUCTURE_PATH, "w") do |f|
        f.puts @files.to_yaml
      end
    end

    # Writes the change log to TMP_CHANGES_PATH as YAML.
    def save_changes
      File.open(TMP_CHANGES_PATH, "w") do |f|
        f.puts @changes.to_yaml
      end
    end

    # Reloads @files from TMP_STRUCTURE_PATH.
    def load_analyzed
      @files = load_yaml_file(TMP_STRUCTURE_PATH)
    end

    # Collects every double- or single-quoted string of +file+ that passes
    # is_path_ok? and should_add_path?.
    #
    # Returns an Array of Hashes with :exists and :is_file (evaluated
    # relative to the directory of +file+) and :path (the raw string).
    def analyze_file(file)
      s = File.read(file)

      possible_paths = s.scan(/"([^"]+)"/).flatten.uniq
      possible_paths += s.scan(/'([^']+)'/).flatten.uniq

      relative_file_path = File.dirname(file)

      paths = Array.new
      possible_paths.each do |pp|
        next unless is_path_ok?(pp)

        f = File.join(relative_file_path, pp)
        h = Hash.new
        h[:exists] = File.exist?(f)
        h[:is_file] = File.file?(f)
        h[:path] = pp

        paths << h if should_add_path?(h)
      end

      paths
    end

    # TODO - check if this string is correct unix path
    # Currently only rejects strings of 200 characters or more.
    def is_path_ok?(pp)
      # pp =~ /\A(?:[0-9a-zA-Z\_\-]+\/?)+\z/
      pp.size < 200
    end

    # TODO
    # Currently accepts every candidate.
    def should_add_path?(h)
      return true
      #return h[:is_file]
    end

    # Directory of the mirrored site (memoized separately from #path).
    def base_path
      @base_path ||= self.downloader.wget.offline_path
      @base_path
    end

    # PROCESSING

    # Removes the "?body=1" suffix from file names and its URL-encoded form
    # from links.
    def process_bad_suffix2
      rename_each_path(false) { |name| name.gsub(/\?body=1/, '') }

      process_massive_gsub("%3Fbody=1", "", false)
    end

    # Removes numeric "?123" / "%3F123" suffixes from file names and links.
    def process_bad_suffix1
      rename_each_path(true) { |name| name.gsub(/\?\d+/, '').gsub(/\%3F\d+/, '') }

      process_massive_gsub(/\%3F\d+/, "", false)
    end

    # Replaces every character outside 0-9 A-Z a-z . - / : with "_".
    def process_bad_filenames
      rename_each_path(true) { |name| name.gsub(/[^0-9A-Za-z.\-\/:]/, '_') }
    end

    #def process_bad_filenames_links
    #  process_massive_gsub(/\%3F/, "_", false)
    #end

    # Renames one file on disk, updates @files and rewrites the links that
    # pointed at the old name.
    #
    # old_file_path - current path of the file.
    # new_file_path - wanted path of the file.
    def process_rename_file(old_file_path, new_file_path)
      puts "rename from #{old_file_path.to_s.blue} to #{new_file_path.to_s.green}"

      # clone to not allow modify of @files
      old_file = old_file_path.clone
      new_file = new_file_path.clone
      # this will be with full path
      old_file_with_path = old_file_path.clone

      old_file.gsub!(base_path, '')
      new_file.gsub!(base_path, '')

      # ignore slashes
      old_file.gsub!(/^\//, '')
      new_file.gsub!(/^\//, '')

      # 1. rename 1 file
      renamed_path = old_file_with_path.gsub(old_file, new_file)
      # The new name may sit in a directory whose name was sanitised too and
      # therefore does not exist yet.
      FileUtils.mkdir_p(File.dirname(renamed_path))
      File.rename(old_file_with_path, renamed_path)

      # internal log-like
      @changes << { rename: { old: old_file_with_path, new: renamed_path } }

      # 2. rename in @files
      @files.each do |f|
        f[:path] = renamed_path if f[:path] == old_file_with_path

        next unless f[:inside]

        f[:inside].each do |fi|
          fi[:path] = renamed_path if fi[:path] == old_file_with_path
        end
      end

      # 3. gsub all files
      # gsub files after renaming
      process_massive_gsub(old_file, new_file, true)
      process_massive_gsub(old_file.gsub("?", "%3F"), new_file, true)

      puts "RENAMED #{old_file.to_s.blue} to #{new_file.to_s.green}"
    end

    # Replaces +from+ with +to+ in every analysed text file and in the link
    # lists kept in @files.
    #
    # from        - String or Regexp to search for.
    # to          - replacement String.
    # check_paths - when true and FIX_RELATIVE_PATH is enabled, the
    #               replacement is made relative to each edited file.
    #
    # Raises RuntimeError when an analysed file no longer exists.
    def process_massive_gsub(from, to, check_paths = false)
      puts "massive gsub #{from.to_s.blue} to #{to.to_s.green}"

      # Callers may hand in a string that is also stored in @files; the loop
      # below edits those strings in place, so search with a private copy.
      from = from.dup if from.is_a?(String)

      @files.each do |f|
        # must be proper mime before, so not needed to check
        next unless f[:inside]

        file_path = f[:path].clone

        puts " open #{file_path.to_s.red}"

        old_from = from.to_s
        old_to = to.to_s

        # relative path fix, computed per file from the original +to+ so
        # one file's result never leaks into the next
        replacement = to
        if check_paths and FIX_RELATIVE_PATH
          first = Pathname.new(File.absolute_path(File.dirname(file_path)))
          second = Pathname.new(File.absolute_path(File.join(path, to)))
          replacement = second.relative_path_from(first).to_s
        end

        raise "file #{file_path} not found" unless File.exist?(file_path)

        s = File.read(file_path)

        # logs
        if s.index(from)
          @changes << { gsub: { old: from, new: replacement, file: file_path, old_from: old_from, old_to: old_to } }
        end

        # gsub! returns nil when nothing matched; untouched files are left
        # exactly as they are on disk
        if s.gsub!(from, replacement)
          File.open(file_path, "w") { |j| j.write(s) }
        end

        f[:inside].each do |fi|
          fi[:path].gsub!(from, replacement)
        end

        puts " done #{file_path.to_s.red}"
      end
    end

    private

    # Yields the path of every analysed file and, when +include_inside+ is
    # true, every link found inside it that exists as a path on disk; the
    # block returns the wanted name. Differing names trigger
    # process_rename_file.
    def rename_each_path(include_inside)
      @files.each do |f|
        old_file = f[:path]
        new_file = yield(old_file)
        process_rename_file(old_file, new_file) unless new_file == old_file

        next unless include_inside and f[:inside]

        f[:inside].each do |fi|
          old_file = fi[:path]
          next unless File.exist?(old_file)

          new_file = yield(old_file)
          process_rename_file(old_file, new_file) unless new_file == old_file
        end
      end
    end

    # Returns the MIME type reported by file(1) for +file+, or nil when the
    # output cannot be parsed. The command is run without a shell so that
    # "?", "&" or spaces in mirrored file names are harmless.
    def detect_mime(file)
      output = IO.popen(["file", "--brief", "--mime", "--", file]) { |io| io.read }
      output.to_s[MIME_REGEXP, 1]
    end

    # Loads a YAML file written by this tool. The data contains Symbol keys,
    # which the default loader of Psych 4 rejects, so the unrestricted loader
    # is used when it exists. Only use this on files the tool wrote itself.
    def load_yaml_file(file)
      if YAML.respond_to?(:unsafe_load_file)
        YAML.unsafe_load_file(file)
      else
        YAML.load_file(file)
      end
    end

  end
end
@@ -0,0 +1,59 @@
1
+ require 'active_support/all'
2
+ require 'fileutils'
3
+
require 'shellwords'

module Pedophile
  # Thin wrapper around the wget command line tool: single downloads, form
  # posts and full site mirroring, all sharing one cookie file.
  class Wget
    TMP_PATH = "tmp"
    TMP_ABSOLUTE_PATH = File.absolute_path(TMP_PATH)
    TMP_FILE_PATH = File.absolute_path(File.join(TMP_PATH, "tmp.tmp"))
    COOKIES_FILE_PATH = File.absolute_path(File.join(TMP_PATH, "cookies.txt"))
    TMP_OFFLINE_PATH = File.join(TMP_PATH, "site")

    # Options shared by every wget call. The cookie path is shell-escaped
    # because this string is only ever interpolated into a command line.
    WGET_PARAMS = "-v --random-wait --user-agent=Mozilla/5.0 --keep-session-cookies --load-cookies #{Shellwords.escape(COOKIES_FILE_PATH)} --save-cookies #{Shellwords.escape(COOKIES_FILE_PATH)}"
    # http://www.gnu.org/software/wget/manual/html_node/Download-Options.html
    #WGET_RESTRICT_FILE_NAMES = "windows" # windows, ascii, unix
    WGET_RESTRICT_FILE_NAMES = "unix"
    WGET_MIRROR_PARAMS = "--adjust-extension --mirror --page-requisites --convert-links --restrict-file-names=#{WGET_RESTRICT_FILE_NAMES}"

    # downloader - the owning Pedophile::Downloader. Creates the temporary
    # directories if they are missing.
    def initialize(downloader)
      @downloader = downloader
      prepare_tmp_path
    end

    attr_reader :downloader

    # Creates TMP_PATH and TMP_OFFLINE_PATH when they do not exist yet.
    def prepare_tmp_path
      Dir.mkdir(TMP_PATH) unless File.exist?(TMP_PATH)
      Dir.mkdir(TMP_OFFLINE_PATH) unless File.exist?(TMP_OFFLINE_PATH)
    end

    # Downloads +url+ into the temporary file and returns its content.
    def download(url)
      # The URL is escaped so "&" or "?" in it are not interpreted by the shell.
      `wget #{WGET_PARAMS} #{Shellwords.escape(url.to_s)} -O#{Shellwords.escape(TMP_FILE_PATH)}`
      File.read(TMP_FILE_PATH)
    end

    # Posts +params+ (a Hash, encoded with to_query) to +url+ and returns
    # the response body.
    def post(url, params)
      post_data = params.to_query
      `wget #{WGET_PARAMS} #{Shellwords.escape(url.to_s)} --post-data #{Shellwords.escape(post_data)} -O#{Shellwords.escape(TMP_FILE_PATH)}`
      File.read(TMP_FILE_PATH)
    end

    # Mirrors the downloader's url into TMP_OFFLINE_PATH. wget only runs
    # when changing into that directory succeeded, so a missing directory
    # can no longer cause a mirror into the current directory.
    def mirror
      `cd #{Shellwords.escape(TMP_OFFLINE_PATH)} && wget #{WGET_PARAMS} #{WGET_MIRROR_PARAMS} #{Shellwords.escape(self.downloader.url.to_s)}`
    end

    # Removes the whole temporary directory and recreates it empty.
    def clear!
      FileUtils.rm_rf(TMP_ABSOLUTE_PATH)
      prepare_tmp_path
    end

    # Name of the mirrored site directory inside TMP_OFFLINE_PATH.
    #
    # Directories are preferred because archives created by the zip step are
    # written into the same folder; entries are sorted so the answer does
    # not depend on directory listing order. Falls back to the first entry
    # of any kind, and returns nil when the folder is empty.
    def site_last_path
      entries = (Dir.entries(Wget::TMP_OFFLINE_PATH) - ["..", "."]).sort
      site_dir = entries.find do |entry|
        File.directory?(File.join(Wget::TMP_OFFLINE_PATH, entry))
      end
      site_dir || entries.first
    end

    # Path of the mirrored site directory.
    def offline_path
      File.join(TMP_OFFLINE_PATH, site_last_path)
    end

  end
end
metadata ADDED
@@ -0,0 +1,141 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pedophile
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Aleksander Kwiatkowski
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-03-16 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: activesupport
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: colorize
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: bundler
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: jeweler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: simplecov
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: download static pages for offline usage.
98
+ email: bobikx@poczta.fm
99
+ executables: []
100
+ extensions: []
101
+ extra_rdoc_files:
102
+ - LICENSE.txt
103
+ - README.md
104
+ files:
105
+ - Gemfile
106
+ - Gemfile.lock
107
+ - LICENSE.txt
108
+ - README.md
109
+ - Rakefile
110
+ - VERSION
111
+ - lib/pedophile.rb
112
+ - lib/pedophile/big_files.rb
113
+ - lib/pedophile/downloader.rb
114
+ - lib/pedophile/login.rb
115
+ - lib/pedophile/offline_tree.rb
116
+ - lib/pedophile/wget.rb
117
+ homepage: http://github.com/akwiatkowski/pedophile
118
+ licenses:
119
+ - LGPLv3
120
+ metadata: {}
121
+ post_install_message:
122
+ rdoc_options: []
123
+ require_paths:
124
+ - lib
125
+ required_ruby_version: !ruby/object:Gem::Requirement
126
+ requirements:
127
+ - - ">="
128
+ - !ruby/object:Gem::Version
129
+ version: '0'
130
+ required_rubygems_version: !ruby/object:Gem::Requirement
131
+ requirements:
132
+ - - ">="
133
+ - !ruby/object:Gem::Version
134
+ version: '0'
135
+ requirements: []
136
+ rubyforge_project:
137
+ rubygems_version: 2.2.2
138
+ signing_key:
139
+ specification_version: 4
140
+ summary: download static pages for offline usage
141
+ test_files: []