apple_epf 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (30) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README.md +90 -0
  3. data/Rakefile +35 -0
  4. data/lib/apple_epf.rb +43 -0
  5. data/lib/apple_epf/downloader.rb +164 -0
  6. data/lib/apple_epf/errors.rb +8 -0
  7. data/lib/apple_epf/extractor.rb +43 -0
  8. data/lib/apple_epf/logging.rb +32 -0
  9. data/lib/apple_epf/main.rb +113 -0
  10. data/lib/apple_epf/parser.rb +95 -0
  11. data/lib/apple_epf/version.rb +3 -0
  12. data/lib/core_ext/array.rb +30 -0
  13. data/lib/core_ext/module.rb +63 -0
  14. data/lib/tasks/apple_epf_tasks.rake +4 -0
  15. data/spec/lib/apple_epf/downloader_spec.rb +220 -0
  16. data/spec/lib/apple_epf/exctractor_spec.rb +72 -0
  17. data/spec/lib/apple_epf/main_spec.rb +185 -0
  18. data/spec/lib/apple_epf/parser_spec.rb +66 -0
  19. data/spec/spec_helper.rb +40 -0
  20. data/spec/support/fixture_helper.rb +13 -0
  21. data/spec/support/fixtures/itunes/epf/current_full_list.html +20 -0
  22. data/spec/support/fixtures/itunes/epf/current_inc_list.html +19 -0
  23. data/spec/support/fixtures/itunes/epf/incremental/itunes20130111.tbz +0 -0
  24. data/spec/support/fixtures/itunes/epf/incremental/itunes20130111/application +21 -0
  25. data/spec/support/fixtures/itunes/epf/incremental/itunes20130111/application_with_nil +7 -0
  26. data/spec/support/fixtures/itunes/epf/incremental/itunes20130111/test_file.txt +0 -0
  27. data/spec/support/fixtures/itunes/epf/incremental/popularity20130111.tbz +0 -0
  28. data/spec/support/fixtures/itunes/epf/incremental/popularity20130111/popularity1 +0 -0
  29. data/spec/support/fixtures/itunes/epf/incremental/popularity20130111/popularity2 +0 -0
  30. metadata +255 -0
@@ -0,0 +1,20 @@
1
+ Copyright 2013 Artem Kramrenko
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,90 @@
1
+ # AppleEpf
2
+
3
+ ## Installation
4
+
5
+ gem 'apple_epf'
6
+
7
+ ## Download incremental
8
+
9
+ AppleEpf.get_incremental( 'current',
10
+ lambda { |header| puts header },
11
+ lambda { |row| puts row },
12
+ lambda { |success| puts 'Yeah!' if success } )
13
+
14
+ ## Setup
15
+
16
+ Put this in your initializer.rb if you are using Rails.
17
+
18
+ AppleEpf.configure do |config|
19
+ config.apple_id = 'username'
20
+ config.apple_password = 'password'
21
+ config.download_retry_count = 3 #
22
+ config.keep_tbz_after_extract = false
23
+ config.extract_dir = '' # where to extract to
24
+ config.log_file = 'path to log file' #absolute path to log file
25
+ config.files_matrix = {} # {popularity: ['application_popularity_per_genre']}
26
+ end
27
+
28
+ All of this can be redefined for every downloader.
29
+
30
+ ## Manual manipulations
31
+
32
+ # Manually download one file
33
+ downloader = AppleEpf::Downloader.new('incremental', 'popularity', Date.parse('17-01-2013'))
34
+ downloader.download #=> will return local filepath to downloaded file or fire exception
35
+
36
+
37
+ # Manually extract one archive
38
+ extractor = AppleEpf::Extractor.new(filename, files_to_extract)
39
+ # filename - full path to local file
40
+ # files_to_extract - Files to be extracted from Archive (application, application_detail)
41
+ file_entry = extractor.perform #=> will return instance of FileEntry
42
+ file_entry.tbz_file #=> original file that was parsed. It is removed after untaring
43
+ file_entry.extracted_files #=> newly created(unpacked) files
44
+
45
+ #Manually parse file
46
+ parser = AppleEpf::Parser.new(filename, header_block, row_block)
47
+ # filename - full local path to file
48
+ # header_block - will return header info from file
49
+ # row_block - will return each row from file
50
+
51
+ ## Download and Extract
52
+ If you want to combine downloading and extracting, you can use one of the following
53
+ methods. My personal feeling is that parsing should live on its own and should not be combined in one stack with download and extract. And of course it is better to download and extract files one by one.
54
+
55
+ manager = AppleEpf::Incremental.new('10-10-2012', {popularity: ['application_popularity_per_genre']})
56
+
57
+ manager = AppleEpf::Full.new('10-10-2012', {popularity: ['application_popularity_per_genre']})
58
+
59
+ manager.download_all_files #will download all files for this date for all keys "popularity", 'pricing', 'itunes' etc
60
+
61
+ manager.download_and_extract_all_files #will first download and then extract all files
62
+
63
+ manager.download_and_extract('itunes', ['application', 'application_detail']) #will download only 'itunes' and extract only ['application', 'application_detail']. This ignores the matrix passed to the initializer
64
+
65
+ manager.download('itunes') #will only download file
66
+
67
+ You can override where files are stored by setting the directory directly on the manager or downloader instance
68
+
69
+ manager.store_dir = '/whatever_dir_you_like'
70
+ manager.download('itunes')
71
+
72
+ OR
73
+
74
+ downloader = AppleEpf::Downloader.new('incremental', 'popularity', Date.parse('17-01-2013'))
75
+ downloader.dirpath = '/whatever_dir_you_like'
76
+ downloader.download
77
+
78
+ You can also choose to keep the original tbz files after they have been unpacked
79
+
80
+ extractor.keep_tbz_after_extract = true
81
+
82
+ OR
83
+
84
+ manager.keep_tbz_after_extract = true
85
+
86
+
87
+ ## Get list of current files available for download
88
+
89
+ AppleEpf::Incremental.get_current_list #=> current incremental files
90
+ AppleEpf::Full.get_current_list #=> current full files
@@ -0,0 +1,35 @@
1
+ #!/usr/bin/env rake
2
+ begin
3
+ require 'bundler/setup'
4
+ rescue LoadError
5
+ puts 'You must `gem install bundler` and `bundle install` to run rake tasks'
6
+ end
7
+ begin
8
+ require 'rdoc/task'
9
+ rescue LoadError
10
+ require 'rdoc/rdoc'
11
+ require 'rake/rdoctask'
12
+ RDoc::Task = Rake::RDocTask
13
+ end
14
+
15
+ RDoc::Task.new(:rdoc) do |rdoc|
16
+ rdoc.rdoc_dir = 'rdoc'
17
+ rdoc.title = 'AppleEpf'
18
+ rdoc.options << '--line-numbers'
19
+ rdoc.rdoc_files.include('lib/**/*.rb')
20
+ end
21
+
22
+
23
+ Bundler::GemHelper.install_tasks
24
+
25
+ require 'rake/testtask'
26
+
27
+ Rake::TestTask.new(:test) do |t|
28
+ t.libs << 'lib'
29
+ t.libs << 'test'
30
+ t.pattern = 'test/**/*_test.rb'
31
+ t.verbose = false
32
+ end
33
+
34
+
35
+ task :default => :test
@@ -0,0 +1,43 @@
1
+ require 'tmpdir'
2
+ require 'fileutils'
3
+ require 'chronic'
4
+ require 'core_ext/array'
5
+ require 'core_ext/module'
6
+ require 'apple_epf/errors'
7
+ require 'apple_epf/logging'
8
+ require 'apple_epf/main'
9
+ require 'apple_epf/downloader'
10
+ require 'apple_epf/extractor'
11
+ require 'apple_epf/parser'
12
+
13
# Namespace and global configuration for the apple_epf gem.
#
# Every setting is a module-level accessor declared with +mattr_accessor+
# (presumably supplied by core_ext/module -- confirm) backed by a class
# variable that holds its default. Use AppleEpf.configure to change them.
module AppleEpf
  # Feed kinds the gem distinguishes.
  FILE_TYPES = %w( full incremental ).freeze

  # Login used for HTTP basic auth against the feed.
  mattr_accessor :apple_id
  @@apple_id = "test"

  # Password used for HTTP basic auth against the feed.
  mattr_accessor :apple_password
  @@apple_password = "test"

  # How many times an interrupted download is restarted before giving up.
  mattr_accessor :download_retry_count
  @@download_retry_count = 3

  # When true the .tbz archive is left on disk after extraction.
  mattr_accessor :keep_tbz_after_extract
  @@keep_tbz_after_extract = false

  # Base directory for downloaded and extracted files.
  mattr_accessor :extract_dir
  @@extract_dir = [Dir.tmpdir, 'epm_files'].join('/')

  # When true log messages are also echoed to stdout.
  # The original assigned this default to @@keep_tbz_after_extract by mistake,
  # overwriting that setting and leaving this one without a default.
  mattr_accessor :log_to_console
  @@log_to_console = false

  # Path of the log file.
  mattr_accessor :log_file
  @@log_file = '/dev/null'

  # Maps a feed name to the list of files to extract from its archive.
  mattr_accessor :files_matrix
  @@files_matrix = {popularity: ['application_popularity_per_genre']}.freeze

  # Yields the module itself so settings can be assigned in a block:
  #
  #   AppleEpf.configure { |config| config.apple_id = 'username' }
  def self.configure
    yield self
  end
end
@@ -0,0 +1,164 @@
1
+ require 'net/http'
2
+ require 'date'
3
+ require 'curb'
4
+ require 'digest/md5'
5
+
6
module AppleEpf
  # Fetches one EPF archive (.tbz) from the iTunes feed using HTTP basic auth
  # and checks it against the .md5 file published next to it.
  #
  #   downloader = AppleEpf::Downloader.new('incremental', 'popularity', Date.parse('17-01-2013'))
  #   downloader.download #=> local path of the downloaded archive
  class Downloader
    include AppleEpf::Logging

    ITUNES_FLAT_FEED_URL = 'http://feeds.itunes.apple.com/feeds/epf/v3/full'.freeze

    # type     - "full" or "incremental"
    # filename - feed name: itunes, popularity, match, pricing
    # filedate - date of the requested dump (must respond to #wday, #to_time and #strftime)
    attr_accessor :type, :filename, :filedate

    # download_to         - local path the archive is written to (set by #download)
    # apple_filename_full - full remote URL of the archive (set by #download)
    attr_reader :download_to, :apple_filename_full
    attr_writer :dirpath

    def initialize(type, filename, filedate)
      @type = type
      @filename = filename
      @filedate = filedate
    end

    # Resolves the remote file, downloads it and verifies its checksum.
    #
    # Returns the local path of the downloaded archive.
    # Raises AppleEpf::DownloaderError (or a subclass) when the file cannot be
    # located, downloaded or verified, and AppleEpf::BadCredentialsError on a
    # 401 response.
    def download
      _prepare_folders
      get_filename_by_date_and_type

      @apple_filename_full = apple_filename_full_url(@apple_filename_full_path)
      @download_to = File.join(dirpath, File.basename(@apple_filename_full))

      logger_info "Download file: #{@apple_filename_full}"
      logger_info "Download to: #{@download_to}"

      @download_retry = 0
      start_download
      download_and_compare_md5_checksum
      @download_to
    end

    # Directory the archive is stored in: the explicitly assigned dirpath (or
    # AppleEpf.extract_dir when none was set) with the feed type appended.
    def dirpath
      File.join((@dirpath || AppleEpf.extract_dir), @type)
    end

    # Downloads "<archive url>.md5" and compares it with the MD5 of the local file.
    #
    # Returns the checksum string.
    # Raises AppleEpf::Md5CompareError when the published checksum cannot be
    # read or does not match.
    #TODO combine with start_download
    def download_and_compare_md5_checksum
      curl = Curl::Easy.new("#{@apple_filename_full}.md5")
      curl.http_auth_types = :basic
      curl.username = AppleEpf.apple_id
      curl.password = AppleEpf.apple_password
      curl.perform

      # The checksum is taken from whatever follows the last "=" on a line.
      # An explicit check replaces the former `rescue NoMethodError`, which also
      # hid unrelated programming errors behind a "checksum mismatch" message.
      checksum_match = curl.body_str.to_s.match(/.*=(.*)/)
      unless checksum_match
        raise AppleEpf::Md5CompareError, 'Unable to read md5 checksum provided by apple'
      end
      @md5_checksum = checksum_match[1].strip

      if Digest::MD5.file(@download_to).hexdigest != @md5_checksum
        raise AppleEpf::Md5CompareError, 'Md5 of downloaded file is not the same as apple provide'
      end

      @md5_checksum
    end

    # Builds the path of the archive relative to the feed root and verifies
    # that it exists remotely.
    #
    # Returns the relative path (also stored in @apple_filename_full_path).
    # Raises AppleEpf::DownloaderError when no path can be derived from @type
    # and AppleEpf::FileNotExist when the remote file is missing.
    def get_filename_by_date_and_type
      path = ""
      case @type
      when "full"
        path = "#{main_dir_date}/#{@filename}#{main_dir_date}.tbz"
      when "incremental"
        date_of_file = date_to_epf_format(@filedate)
        path = "#{main_dir_date}/incremental/#{date_of_file}/#{@filename}#{date_of_file}.tbz"
      when "file"
        #TODO: FIX THIS
        # date = date_to_epf_format( @filedate, check_if_in_previous_week, check_if_in_thursday )
        # path = "#{file}#{date}.tbz"
      end

      # Fail loudly if no url was suggested or the file does not exist
      raise AppleEpf::DownloaderError, "Unable to find out what file do you want to download" if path.empty?
      raise AppleEpf::FileNotExist, "File does not exist #{path}" unless file_exists?(path)

      @apple_filename_full_path = path
    end

    # Name of the downloaded archive without directory and ".tbz" extension,
    # e.g. popularity20130109.
    def downloaded_file_base_name
      File.basename(@download_to, '.tbz')
    end

    private

    # Joins a feed-relative path onto the feed root URL.
    def apple_filename_full_url(path)
      File.join(ITUNES_FLAT_FEED_URL, path)
    end

    # Makes sure the target directory exists.
    def _prepare_folders
      logger_info "Create folders for path: #{dirpath}"
      FileUtils.mkpath(dirpath)
    end

    # Date (YYYYMMDD) of the Wednesday naming the weekly feed folder.
    def main_dir_date
      this_or_last =
        case @type
        when "incremental"
          # from Mon to Thursday dumps are in prev week folder
          @filedate.wday <= 4 ? 'last' : 'this'
        when "full"
          # full downloads usually are done only once. user can determine when it should be done
          'this'
        end

      main_folder_date = Chronic.parse("#{this_or_last} week wednesday", :now => @filedate.to_time).to_date
      date_to_epf_format(main_folder_date)
    end

    # Formats a date the way the feed names folders and files (YYYYMMDD).
    def date_to_epf_format(date)
      date.strftime("%Y%m%d")
    end

    # Issues an authenticated HEAD request for the given feed-relative path.
    #
    # Returns true only for a 200 response.
    # Raises AppleEpf::BadCredentialsError on a 401 response.
    def file_exists?(path_to_check)
      full_url = apple_filename_full_url(path_to_check)
      logger_info "Checking file at URL: #{full_url}"

      uri = URI.parse(full_url)

      # Send only the path (and query) in the request line; host and port are
      # already given to Net::HTTP.new below.
      request = Net::HTTP::Head.new(uri.request_uri)
      request.basic_auth(AppleEpf.apple_id, AppleEpf.apple_password)

      response = Net::HTTP.new(uri.host, uri.port).start { |http| http.request(request) }

      raise AppleEpf::BadCredentialsError, 'Bad credentials' if response.code == "401"

      response.code == "200"
    end

    # Streams the archive into @download_to, restarting from scratch when curl
    # reports a partial transfer, at most AppleEpf.download_retry_count times.
    #
    # Raises AppleEpf::CurlError once the retries are used up.
    def start_download
      curl = Curl::Easy.new(@apple_filename_full)

      # Authentication
      curl.http_auth_types = :basic
      curl.username = AppleEpf.apple_id
      curl.password = AppleEpf.apple_password

      # Opening with 'wb' truncates whatever a previous attempt left behind.
      File.open(@download_to, 'wb') do |file|
        # IO#write returns the number of bytes written, which is handed back
        # to curl as the amount of data consumed.
        curl.on_body { |chunk| file.write(chunk) }
        curl.perform
      end
    rescue Curl::Err::PartialFileError
      if @download_retry >= AppleEpf.download_retry_count
        raise AppleEpf::CurlError, "Unable to download file."
      end

      @download_retry += 1
      logger_info "Curl::Err::PartialFileError happened..."
      logger_info "Restarting download"
      retry
    end
  end
end
@@ -0,0 +1,8 @@
1
module AppleEpf
  # Root of all errors raised while locating, fetching or verifying a feed file.
  class DownloaderError < StandardError
  end

  # The requested archive is not present on the feed server.
  class FileNotExist < DownloaderError
  end

  # The transfer itself could not be completed.
  class CurlError < DownloaderError
  end

  # The downloaded archive does not match the published MD5 checksum.
  class Md5CompareError < DownloaderError
  end

  # The feed server rejected the configured credentials. Deliberately not a
  # DownloaderError, so rescuing DownloaderError does not catch it.
  class BadCredentialsError < StandardError
  end
end
@@ -0,0 +1,43 @@
1
module AppleEpf
  # Unpacks selected files from a downloaded .tbz archive with the system `tar`.
  #
  #   extractor = AppleEpf::Extractor.new('/path/itunes20130111.tbz', ['application'])
  #   entry = extractor.perform
  #   entry.extracted_files #=> {"application" => "/path/itunes20130111/application"}
  class Extractor
    # Result of an extraction: the archive path and a hash mapping each
    # requested file name to the full path of the extracted file.
    FileEntry = Struct.new(:tbz_file, :extracted_files)

    attr_reader :file_entry, :filename, :dirname, :basename

    # Per-instance override of AppleEpf.keep_tbz_after_extract; nil means
    # "use the global setting".
    attr_accessor :keep_tbz_after_extract

    # filename         - full path to the local .tbz archive
    # files_to_extract - names of the files inside the archive to unpack
    def initialize(filename, files_to_extract)
      @filename = filename
      @files_to_extract = files_to_extract

      @dirname = File.dirname(@filename)
      @basename = File.basename(@filename)
    end

    # Extracts the requested files next to the archive and, unless configured
    # otherwise, deletes the archive afterwards.
    #
    # Returns the FileEntry describing the result.
    # Raises RuntimeError when tar exits unsuccessfully or cannot be run.
    #TODO use multithread uncompressing tool
    def perform
      # Inside the archive every file lives in a folder named after the archive.
      archive_folder = File.basename(@filename, '.tbz')
      @extracted_files = @files_to_extract.map { |name| "#{archive_folder}/#{name}" }

      # Arguments are passed as a list, so no shell is involved and paths
      # containing spaces or shell metacharacters are handled safely.
      extracted = system('tar', '-xjf', @basename, *@extracted_files, chdir: @dirname)
      unless extracted
        raise "Unable to extract files '#{@files_to_extract.join(' ')}' from #{@filename}"
      end

      full_paths = @extracted_files.map { |path| File.join(@dirname, path) }
      @file_entry = FileEntry.new(@filename, Hash[@files_to_extract.zip(full_paths)])
      FileUtils.remove_file(@filename, true) unless keep_tbz_after_extract?

      @file_entry
    end

    private

    # The instance setting wins whenever it was set (even to false); only an
    # unset (nil) value falls back to the global configuration.
    def keep_tbz_after_extract?
      return !!keep_tbz_after_extract unless keep_tbz_after_extract.nil?

      !!AppleEpf.keep_tbz_after_extract
    end
  end
end
@@ -0,0 +1,32 @@
1
+ require 'logger'
2
+
3
module AppleEpf
  # Mixin giving including classes access to a single shared Logger that
  # writes to AppleEpf.log_file.
  module Logging
    # Returns the shared logger, building it on first use.
    def self.logger
      @logger ||= initialize_logger
    end

    # Opens AppleEpf.log_file for appending (creating it when missing) and
    # wraps it in a DEBUG-level Logger. A failure is announced on stdout and
    # then re-raised unchanged.
    def self.initialize_logger
      log_io = File.open(AppleEpf.log_file, File::WRONLY | File::APPEND | File::CREAT)
      Logger.new(log_io, 'weekly').tap { |built| built.level = Logger::DEBUG }
    rescue
      p "Unable to create logger"
      raise
    end

    # Instance-level shortcut to the shared logger.
    def logger
      AppleEpf::Logging.logger
    end

    # Logs +data+ at INFO level and echoes it to stdout when
    # AppleEpf.log_to_console is set.
    def logger_info(data)
      logger.info(data)
      p data if AppleEpf.log_to_console
    end
  end
end
@@ -0,0 +1,113 @@
1
+ require 'nokogiri'
2
+
3
module AppleEpf
  # Base manager that ties download and extraction together for one feed date.
  # The concrete subclasses Incremental and Full supply #type and .current_url
  # and mix in BaseActions.
  class Main
    # NOTE(review): nothing in this file assigns @downloader, so the
    # +downloader+ reader appears to always return nil -- confirm.
    attr_reader :downloader, :filedate, :files_matrix
    attr_accessor :store_dir, :keep_tbz_after_extract

    # filedate               - date of the dump to work with
    # files_matrix           - hash of feed name => files to extract
    #                          (defaults to AppleEpf.files_matrix)
    # store_dir              - directory to download into (optional)
    # keep_tbz_after_extract - truthy to keep archives after extraction;
    #                          otherwise the global setting is used
    def initialize(filedate, files_matrix = nil, store_dir = nil, keep_tbz_after_extract = nil)
      @filedate = filedate
      @files_matrix = files_matrix || AppleEpf.files_matrix
      @store_dir = store_dir
      @keep_tbz_after_extract = !!keep_tbz_after_extract || AppleEpf.keep_tbz_after_extract
    end

    # Fetches the listing page at current_url and collects the .tbz archives
    # linked from it.
    #
    # Returns a hash mapping the leading lowercase part of each archive name
    # to the digits that follow it, e.g. {"itunes" => "20130111"}.
    def self.get_current_list
      curl = Curl::Easy.new(current_url)
      curl.http_auth_types = :basic
      curl.username = AppleEpf.apple_id
      curl.password = AppleEpf.apple_password
      curl.perform

      link_texts = Nokogiri::HTML(curl.body_str).xpath("//td[2]/a").map(&:text)
      archives = link_texts.select { |text| text =~ /.*tbz$/ }.map { |text| text.chomp('.tbz') }

      archives.each_with_object({}) do |archive, list|
        # Both groups may match the empty string, so this pattern always matches.
        parts = archive.match(/([a-z]*)(\d*)/)
        list[parts[1]] = parts[2]
      end
    end

    # Download/extract operations shared by Incremental and Full.
    module BaseActions
      # Downloads every feed named in the files matrix. A feed that fails with
      # AppleEpf::DownloaderError is logged and skipped.
      #
      # Yields each result of #download when a block is given.
      # Returns the array of those results.
      def download_all_files(&block)
        downloaded_files = []

        @files_matrix.each_key do |filename|
          begin
            downloaded = download(filename)
            downloaded_files << downloaded
            block.call(downloaded) if block
          rescue AppleEpf::DownloaderError => error
            AppleEpf::Logging.logger.fatal "Failed to download file #{filename}"
            AppleEpf::Logging.logger.fatal error
          end
        end

        downloaded_files
      end

      # Downloads and extracts every feed named in the files matrix. Any
      # StandardError for a feed is logged (including the error itself) and
      # that feed is skipped.
      #
      # Yields each result of #download_and_extract when a block is given.
      # Returns the array of those results.
      def download_and_extract_all_files(&block)
        extracted_files = []

        @files_matrix.each_pair do |filename, extractables|
          begin
            extracted_file = download_and_extract(filename.to_s, extractables)
            extracted_files << extracted_file
            block.call(extracted_file) if block
          rescue => error
            AppleEpf::Logging.logger.fatal "Failed to download and parse file #{filename}"
            # Log the cause as well, matching download_all_files.
            AppleEpf::Logging.logger.fatal error
          end
        end

        extracted_files
      end

      # Downloads one feed and extracts the given files from it.
      # Returns whatever #extract returns (the extractor's file entry).
      def download_and_extract(filename, extractables)
        downloader = download(filename.to_s)
        extract(downloader.download_to, extractables)
      end

      # Downloads one feed for this manager's type and date.
      # Returns the AppleEpf::Downloader that performed the download.
      def download(filename)
        downloader = AppleEpf::Downloader.new(type, filename.to_s, @filedate)
        downloader.dirpath = @store_dir if @store_dir
        downloader.download
        downloader
      end

      # Extracts +extractables+ from an already downloaded archive.
      # Returns the extractor's file entry.
      def extract(downloaded_file, extractables)
        extractor = AppleEpf::Extractor.new(downloaded_file, extractables)
        extractor.keep_tbz_after_extract = @keep_tbz_after_extract if @keep_tbz_after_extract
        extractor.perform
        extractor.file_entry
      end
    end
  end

  # Manager for the incremental feed.
  class Incremental < Main
    include BaseActions

    def type
      'incremental'
    end

    def self.current_url
      'http://feeds.itunes.apple.com/feeds/epf/v3/full/current/incremental/current'
    end
  end

  # Manager for the full feed.
  class Full < Main
    include BaseActions

    def type
      'full'
    end

    def self.current_url
      'http://feeds.itunes.apple.com/feeds/epf/v3/full/current'
    end
  end
end