sjunkieex 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +19 -0
- data/Gemfile +7 -0
- data/LICENSE +18 -0
- data/README.md +22 -0
- data/Rakefile +9 -0
- data/bin/sjunkieex +122 -0
- data/lib/sjunkieex/interface.rb +147 -0
- data/lib/sjunkieex/series_index.rb +134 -0
- data/lib/sjunkieex/version.rb +3 -0
- data/lib/sjunkieex.rb +7 -0
- data/sjunkieex.gemspec +20 -0
- data/test/seriesindex_example.xml +4568 -0
- data/test/site_dumps/chase.html +757 -0
- data/test/site_dumps/homepage.html +3345 -0
- data/test/test_helper.rb +8 -0
- data/test/test_interface.rb +53 -0
- data/test/test_series_index.rb +69 -0
- metadata +101 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
|
2
|
+
(General Public License)
|
3
|
+
|
4
|
+
Copyright (c) 2012 Philipp Böhm
|
5
|
+
|
6
|
+
This program is free software; you can redistribute it and/or modify
|
7
|
+
it under the terms of the GNU General Public License as published by
|
8
|
+
the Free Software Foundation in version 3 of the License.
|
9
|
+
|
10
|
+
This program is distributed in the hope that it will be useful,
|
11
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
GNU General Public License for more details.
|
14
|
+
|
15
|
+
You should have received a copy of the GNU General Public License
|
16
|
+
along with this program; if not, write to the Free Software
|
17
|
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
18
|
+
MA 02110-1301, USA.
|
data/README.md
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# Sjunkieex (Serienjunkies Extractor)
|
2
|
+
|
3
|
+
Program that extracts links from serienjunkies.org for your series and
|
4
|
+
searches for new episodes for existing series.
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Install it yourself with (add sudo for systemwide installation):
|
9
|
+
|
10
|
+
$ gem install sjunkieex
|
11
|
+
|
12
|
+
## Usage
|
13
|
+
|
14
|
+
TODO: Write usage instructions here
|
15
|
+
|
16
|
+
## Contributing
|
17
|
+
|
18
|
+
1. Fork it
|
19
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
20
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
21
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
22
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/bin/sjunkieex
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
#!/usr/bin/env ruby
# -*- ruby -*-
# encoding: UTF-8
#
# Command-line entry point: scans serienjunkies.org for new episodes of the
# locally indexed series, picks links for the configured hoster/resolution
# and optionally dumps them to a file.

$LOAD_PATH << File.join(File.dirname(__FILE__), '..', 'lib')

require 'sjunkieex'
require 'hashconfig'
require 'optparse'
require 'fileutils'
require 'yaml' # BUGFIX: needed for String#to_yaml below; was only loaded transitively

# create program configuration dirs/files
CONFIG_DIR = File.join( File.expand_path("~"), ".sjunkieex" )
CONFIG_FILE = File.join( CONFIG_DIR, "config.yml" )
FileUtils.mkdir(CONFIG_DIR) unless File.directory?(CONFIG_DIR)

###
# configuration (user overrides are merged in from CONFIG_FILE)
STANDARD_CONFIG = {
  :hd_series => ["SERIES_NOT_EXISTING_PLACEHOLDER"],
  :index_directory => File.join(CONFIG_DIR, ".index/"),
  :index_suffix => "xml",
  :hoster_id => "nl",
  :dump_links => true,
  :dump_file => "/tmp/gsl_links.txt",
}.merge(Sjunkieex::Interface::STANDARD_CONFIG)

config = STANDARD_CONFIG.merge_with_serialized(CONFIG_FILE)

###
# option definition and handling
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: #{File.basename($PROGRAM_NAME)}"

  opts.separator("")
  opts.separator("Tool that extracts episodelinks from serienjunkies.org.")
  opts.separator("")
  opts.separator(" Options:")

  # opts.on( "-i", "--ignore-seriesinfo",
  #         "do not use the information from the infostore") do |opt|
  #   config[:read_episode_info] = false
  # end

  opts.on( "-v", "--version",
          "Outputs the version number.") do |opt|
    puts Sjunkieex::VERSION
    exit
  end

  opts.separator("")

end.parse!

fail "index directory #{ config[:index_directory]} does not exist" unless
  File.directory?(config[:index_directory])

glob_pattern = File.join(config[:index_directory], "*.#{config[:index_suffix]}")
files = Dir[glob_pattern]

series_index = Sjunkieex::SeriesIndex.new(files: files)

# BUGFIX: the original message ("there is data for series existing") stated
# the opposite of the failure condition
fail "there is no series data existing" if series_index.empty?

###
# look for new series
dump_links = []
interface = Sjunkieex::Interface.new(series_index, config)

interface.look_for_new_episodes.each do |link, series|
  puts "\nLook for new episodes in '#{series}'"

  episode_links = interface.parse_series_page(series, link)
  episode_links.each do |identifier, link_data|
    puts link_data[:episodedata]

    hd = config[:hd_series].include?(series)

    ###
    # select links, depending on wanted resolution
    # (renamed from `links` to avoid clobbering the parsed-hash variable)
    candidates = []
    if hd
      if link_data[:hd_1080p]
        candidates = link_data[:hd_1080p]
      elsif link_data[:hd_720p]
        candidates = link_data[:hd_720p]
      end
    else
      if link_data[:sd]
        candidates = link_data[:sd]
      end
    end

    if candidates.empty?
      puts "there are no links in this resolution"
      next
    end

    # keep only links for the configured hoster; the block variable is NOT
    # named `link` any more, which used to shadow the outer loop variable
    download_links = candidates.select do |hoster_link|
      hoster_link.match(/\/f-\w+\/#{ config[:hoster_id] }_/)
    end

    if download_links.empty?
      puts "there are no links for this hoster"
      next
    end

    print download_links[0].to_yaml
    dump_links << download_links[0]
  end
end

###
# dump links to file
if config[:dump_links]
  File.open(config[:dump_file], 'w') do |f|
    dump_links.each { |l| f.write(l + "\n")}
  end
end
|
@@ -0,0 +1,147 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'zlib'
|
3
|
+
|
4
|
+
module Sjunkieex

  # Scrapes serienjunkies.org: discovers new episodes on the homepage and
  # extracts the hoster links from individual series pages.
  class Interface

    STANDARD_CONFIG = {
      url: "http://serienjunkies.org",
      german_only: true,
      hd: false,
      subbed_allowed: false,
    }

    attr_reader :options

    # Public: create a new interface
    #
    # series_index - SeriesIndex holding the already known episodes
    # options      - Hash overriding STANDARD_CONFIG entries (default: {})
    def initialize(series_index, options = {})
      @options = STANDARD_CONFIG.merge(options)
      @index = series_index
    end

    # Public: Looks for new episodes on the homepage
    #
    # Returns a Hash of links for sites that should be visited
    # (href => series name)
    def look_for_new_episodes
      links = Hash.new

      # NOTE(review): the pages are HTML; Nokogiri::XML is kept here to
      # preserve the existing parsing behavior
      doc = Nokogiri::XML(get_page_data(@options[:url]))
      doc.css("div#content > div.post > div.post-content a").each do |link|
        c = link.content

        ####
        # skip links that are not suitable
        next unless is_link_useful?(c)

        next unless @index.is_series_in_index?(c)

        series_name = Sjunkieex::SeriesIndex.extract_seriesname(c)
        next unless series_name

        next if @index.episode_existing?(series_name, c)

        href = link[:href]
        next if links.include?(href)

        links[href] = series_name
      end

      return links
    end

    # Public: parses a series page and extracts links
    #
    # series_name - the series name and the key in the index
    # series_link - the link to the page
    #
    # Returns a hash indexed by series identifier; each value holds the
    # hoster links per resolution (:sd / :hd_720p / :hd_1080p) plus
    # :episodedata and :series entries
    def parse_series_page(series_name, series_link)

      link_data = Hash.new

      doc = Nokogiri::XML(get_page_data(series_link))
      doc.css("div#content > div.post > div.post-content p").each do |paragraph|

        # paragraphs with a CSS class are navigation/ads, not episode entries
        next if paragraph[:class]

        episode_data = paragraph.css("strong:first-child").text
        next unless is_link_useful?(episode_data)

        next if @index.episode_existing?(series_name, episode_data)

        if id = Sjunkieex::SeriesIndex.extract_episode_identifier(episode_data)

          # classify episode resolution
          resolution = :sd
          (resolution = :hd_720p) if episode_data.match(/720[pi]/i)
          (resolution = :hd_1080p) if episode_data.match(/1080[pi]/i)

          # extract hoster links
          episode_links = []
          paragraph.css("a").each do |link|
            episode_links << link[:href]
          end

          (link_data[id] = Hash.new) unless link_data[id]
          link_data[id][resolution] = episode_links
          link_data[id][:episodedata] = episode_data
          link_data[id][:series] = series_name
        end
      end

      return link_data
    end

    private

    # Internal: check the link data against criterias
    #
    # link_data - data for the link
    #
    # Returns true if the link is useful or false if it can be skipped
    def is_link_useful?(link_data)

      # must at least look like an episode (SxxExx)
      return false unless link_data.match(/S\w+E\w+/i)

      # skip links depending on language
      if @options[:german_only]
        return false unless link_data.match(/German/i)

        unless @options[:subbed_allowed]
          return false if link_data.match(/Subbed/i)
        end
      else
        return false if link_data.match(/German/i)
      end

      true
    end

    # Internal: get a page and do some stuff if the page is gzip encoded
    #
    # link - the link that is fetched
    #
    # Returns the page content
    def get_page_data(link)

      stream = open(link)
      begin
        if stream.is_a? File
          # file is a local file, has not methods below
          stream.read
        elsif stream.content_encoding.empty?
          # file is web uri, plain body
          stream.read
        else
          # gzip encoded response
          Zlib::GzipReader.new(stream).read
        end
      ensure
        # BUGFIX: the handle was leaked before; always release it
        stream.close if stream.respond_to?(:close)
      end
    end

  end

end
|
@@ -0,0 +1,134 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module Sjunkieex

  # Index of locally existing episodes, built from one or more XML dump
  # files (serienindex format). Used to decide whether an episode found
  # online is already present.
  class SeriesIndex

    attr_reader :options, :series_data

    # Public: instantiate a new series_index
    #
    # options - Options (default: {})
    #           :files - Array of series indizes
    #
    def initialize(options = {})
      @options = {files: [], }.merge(options)

      @series_data = Hash.new
      @options[:files].each do |file|
        @series_data.merge!(parse_file(file))
      end

    end

    # Public: checks if there are entries in the index
    #
    # Returns true if there no entries loaded
    def empty?
      @series_data.empty?
    end

    # Public: Check if a supplied episode is in the index
    #
    # series_name  - Name of the series in the index
    # episode_text - episode data
    #
    # Returns true if the episode is existing, false otherwise
    def episode_existing?(series_name, episode_text)
      if @series_data[series_name]

        if id = SeriesIndex.extract_episode_identifier(episode_text)
          if @series_data[series_name][id]
            return true
          end
        end
      end

      return false
    end

    # Public: checks if the seriesname in the supplied data is in the index
    #
    # episode_text - data that contains the episode information
    #
    # Returns true if the series is in the index, false otherwise
    def is_series_in_index?(episode_text)

      if series_name = SeriesIndex.extract_seriesname(episode_text)
        if @series_data[series_name]
          return true
        end
      end

      return false
    end

    # Public: tries to extract the seriesname from supplied data
    #
    # data - data that holds the episode information
    #
    # Returns the seriesname or nil if there is no seriesname
    def self.extract_seriesname(data)
      # NOTE(review): this pattern is case-sensitive while
      # extract_episode_identifier is not -- kept to preserve behavior
      if md = data.match(/(.*)S\d+E\d+/)
        return md[1].gsub(/\./, " ").strip
      end
      nil
    end

    # Public: tries to extract the episode identifier from the episode data
    #
    # data - data that holds the episode information
    #
    # Returns the identifier xx_xx or nil if there is no identifier
    def self.extract_episode_identifier(data)
      if md = data.match(/S(\d+)E(\d+)/i)
        # to_i drops leading zeros, so S01E01 and S1E1 map to the same key
        return "%s_%s" % [md[1].to_i, md[2].to_i]
      end
      nil
    end

    private

    # Internal: parse this file to a hash indexed by seriesname
    #
    # file - path to the xml file
    #
    # Returns a Hash indexed by seriesname with Hashes as values
    #
    #   hash = {
    #     "Chase": {
    #        "1_1": "S01E01 - test.avi",
    #     }
    #   }
    def parse_file(file)

      series_data = Hash.new

      # BUGFIX: File.open(...).read leaked the file handle; File.read
      # opens, reads and closes in one step
      content = File.read(file)
      doc = Nokogiri::XML(content)

      doc.css("serienindex > directory").each do |series_node|

        title = series_node[:name]
        next unless title && title.match(/\w+/)

        series = Hash.new
        series_node.css("file").each do |file_node|

          filename = file_node[:name]
          next unless filename

          if id = SeriesIndex.extract_episode_identifier(filename)
            series[id] = filename
          end
        end

        series_data[title] = series
      end

      series_data
    end

  end

end
|
data/lib/sjunkieex.rb
ADDED
data/sjunkieex.gemspec
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for sjunkieex. The version constant is read from the
# library itself so gemspec and code cannot diverge.
require File.expand_path('../lib/sjunkieex/version', __FILE__)

Gem::Specification.new do |gem|
  gem.authors = ["Philipp Böhm"]
  gem.email = ["philipp@i77i.de"]
  gem.description = %q{Tool that extracts links from serienjunkies.org}
  gem.summary = %q{serienjunkies.org link extractor}
  gem.homepage = ""

  # file lists are derived from git, so nothing needs maintaining by hand;
  # packaging therefore requires a git checkout
  gem.files = `git ls-files`.split($\)
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
  gem.name = "sjunkieex"
  gem.require_paths = ["lib"]
  gem.version = Sjunkieex::VERSION

  # runtime deps: nokogiri for HTML/XML scraping, hashconfig for the
  # YAML-serialized user configuration
  gem.add_runtime_dependency(%q<nokogiri>, [">= 1.5"])
  gem.add_runtime_dependency(%q<hashconfig>, [">= 0.0.1"])
end
|