imdb-html 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
module IMDB
  # Host name of the IMDb site. Frozen so the shared constant cannot be
  # mutated in place by code that receives it.
  BASE_URI = 'www.imdb.com'.freeze
end
@@ -0,0 +1,41 @@
1
module IMDB
  module Data
    # Wraps one row of a cast table and exposes the credited person's name,
    # their IMDb name id and the roles listed in the row.
    class Credit
      include IMDB::REGEXP::HTML

      # The wrapped row node, as passed to the constructor.
      attr_reader :tr

      # @param noko_tr the row node; it is queried with `at` for the
      #   `td.nm` and `td.char` cells (presumably a Nokogiri <tr> element)
      def initialize(noko_tr)
        @tr = noko_tr
      end

      # @return [String] text content of the row's `td.nm` cell
      def name
        tr.at('td.nm').text
      end

      # Digits that follow "/name/nm" in the href of the name cell's link.
      #
      # @return [String, nil] nil when the cell has no link, the link has no
      #   href, or the href does not match NM. The original raised
      #   NoMethodError in the first two cases.
      def nm
        link = tr.at('td.nm a')
        href = link && link["href"]
        href && href[NM]
      end

      # Parses the `td.char` cell into one hash per role.
      #
      # The cell's HTML is cut at the first " (" into the role list and the
      # parenthesised tail. The split is limited to two pieces so every
      # parenthetical ends up in the tail; an unlimited split would produce
      # one piece per parenthetical and all but the first would be dropped.
      #
      # @return [Array<Hash>] each hash has :char (role text with tags
      #   removed), :ch (Integer, only when the role links to a character
      #   page) and one `true` entry per parenthetical, keyed by its text
      def roles
        cell_html = tr.at('td.char').inner_html
        role_part, paren_part = cell_html.split(SPLIT_BEFORE_PAREN, 2)
        role_part ||= ""
        paren_part ||= ""

        flags = paren_part.scan(IN_PARENS).flatten
        role_part.split(" / ").map do |role|
          entry = { :char => role.gsub(%r{<.*?>}, "").strip }
          character_id = role[CH]
          entry[:ch] = character_id.to_i if character_id
          # The same parentheticals are applied to every role in the row.
          flags.each { |flag| entry[flag.to_sym] = true }
          entry
        end
      end
    end
  end
end
@@ -0,0 +1,63 @@
1
require 'date' # Date.parse is used by #date; the stdlib `date` library is not loaded by default

module IMDB
  module Data
    # One episode, anchored at the <h4> heading node that introduces it.
    # The air date, description and cast are read from the nodes that
    # follow the heading.
    class Episode
      include IMDB::REGEXP

      # Builds one hash per episode heading found via `episodes_h4`.
      #
      # @param noko_h4 a node responding to `episodes_h4`
      # @return [Array<Hash>] the `to_h` of every episode
      def self.parse_params(noko_h4)
        noko_h4.episodes_h4.map { |h4| new(h4).to_h }
      end

      # The heading node this episode was built from.
      attr_reader :h4

      # @param noko_h4 the episode's heading node
      def initialize(noko_h4)
        @h4 = noko_h4
      end

      # @return [String] season number from the heading text, left-padded
      #   with zeros to two characters, or "??" when none is found
      def season_number
        padded_number(CONTENT::SEASON)
      end

      # @return [String] episode number from the heading text, padded the
      #   same way as #season_number
      def episode_number
        padded_number(CONTENT::EPISODE)
      end

      # @return [String] combined code such as "S01E02"
      def se
        "S#{season_number}E#{episode_number}"
      end

      # @return [String] text of the first link inside the heading
      def title
        h4.at('a').text
      end

      # @return [String, nil] digits after "/title/tt" in the heading link's
      #   href, or nil when the href does not match
      def tt
        h4.at('a')["href"][HTML::TT]
      end

      # @return [Date] parsed from the text of the element after the heading
      # @raise [ArgumentError] from Date.parse when the text is not a date
      def date
        Date.parse(h4.next_element.text)
      end

      # @return [String] text of the node that follows the second element
      #   after the heading
      def description
        h4.next_element.next_element.next.text
      end

      # Cast rows are looked up in the fourth element after the heading.
      #
      # @return [Array<Hash>, nil] one hash per credit with :name, :nm
      #   (Integer, 0 when the credit has no name id) and :roles; nil when
      #   the cast could not be read
      def cast
        container = h4.next_element.next_element.next_element.next_element
        container.cast.map do |row|
          credit = IMDB::Data::Credit.new(row)
          name_id = credit.nm
          {
            :name  => credit.name,
            :nm    => name_id.nil? ? 0 : name_id.to_i,
            :roles => credit.roles
          }
        end
      rescue StandardError
        # Best effort, as in the original inline `rescue nil`: any failure
        # while reading the cast yields nil instead of aborting the episode.
        nil
      end

      # @return [Hash] :se, :title, :air_date, :description (nil when empty)
      #   and :cast, plus :tt (Integer) when the heading link has a title id
      def to_h
        hash = { :se => se, :title => title }
        title_id = tt
        hash[:tt] = title_id.to_i if title_id
        hash[:air_date] = date
        summary = description
        hash[:description] = summary.empty? ? nil : summary
        hash[:cast] = cast
        hash
      end

      private

      # Extracts `pattern` from the heading text and zero-pads the match to
      # two characters; returns "??" when the pattern does not match.
      def padded_number(pattern)
        num = h4.text[pattern]
        num ? num.rjust(2, '0') : "??"
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
require 'json' # JSON.pretty_generate is used by Show.process

module IMDB
  module Data
    # Fetches pages for one IMDb title and dumps its episode list as JSON.
    class Show
      include HTTParty
      base_uri 'www.imdb.com'
      # NOTE(review): 'utf-8' is a charset, not a media type; confirm this
      # header value is what the server should receive.
      headers 'Content-Type' => 'utf-8'

      # Default directory for generated JSON files (the original hard-coded
      # location, kept as the default for backward compatibility).
      JSON_DIR = "/home/mdt/g/imdb-memo/json".freeze

      # @param tt title id; converted with `to_i`
      def initialize(tt)
        @tt = tt.to_i
      end

      # @return [String] the title id left-padded with zeros to 7 characters
      def tt
        @tt.to_s.rjust(7, '0')
      end

      # @param args extra path segments (nested arrays are flattened)
      # @return [String] "/title/tt<id>" joined with the given segments
      def path(*args)
        File.join("/title/tt#{tt}", *args.flatten)
      end

      # Performs a GET request for the path built from `args`.
      def http(*args)
        self.class.get(path(*args))
      end

      # Parses the response for the path built from `args` as HTML.
      def noko(*args)
        Nokogiri::HTML(http(*args))
      end

      # @return [Array<Hash>] episode hashes parsed from the "epcast" page
      def episodes
        Episode.parse_params(noko("epcast"))
      end

      # @param dir [String] directory that holds the JSON files
      # @return [String] "<dir>/<padded id>.json"
      def json_path(dir = JSON_DIR)
        File.join(dir, "#{tt}.json")
      end

      # Fetches the episodes of title `tt` and writes them as pretty-printed
      # JSON to the show's `json_path`.
      #
      # The JSON is generated before the file is opened, so a failed fetch
      # no longer leaves a truncated file behind.
      #
      # @param tt title id
      # @param dir [String] output directory, defaults to JSON_DIR
      def self.process(tt, dir = JSON_DIR)
        show = new(tt)
        json = JSON.pretty_generate(show.episodes)
        File.open(show.json_path(dir), "w") { |file| file.puts json }
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require 'nokogiri'
2
+ require 'httparty'
3
+
4
+ require 'imdb-html/const'
5
+ require 'imdb-html/regexp/html'
6
+ require 'imdb-html/regexp/content'
7
+
8
+ require 'imdb-html/nokogiri/node'
9
+
10
+ require 'imdb-html/data/credit'
11
+ require 'imdb-html/data/episode'
12
+ require 'imdb-html/data/show'
13
+
14
+
15
# Namespace module for the gem; nothing is defined in it here.
module IMDB
end
@@ -0,0 +1,29 @@
1
module IMDB
  # Reopens Nokogiri::XML::Node to add lookup helpers used when reading
  # IMDb pages.
  class Nokogiri::XML::Node
    # Parent nodes of every cell matching the first of `td.ddd` or
    # `td.ellipsis` that occurs under this node.
    #
    # @return [Array, nil] nil when neither selector matches anything
    def cast
      selector = %w{td.ddd td.ellipsis}.find { |css_pattern| at(css_pattern) }
      css(selector).map(&:parent) if selector
    end

    # @return the first `div#tn15content` under this node, or nil
    def tn15content
      at('div#tn15content')
    end

    # Season identifiers taken from the "#season-..." links in the first
    # paragraph of the content div.
    #
    # Anchors with no href are skipped; the original raised NoMethodError on
    # them even though non-matching links were already dropped by `compact`.
    #
    # @return [Array<String>] the text after "#season-" in each matching href
    def p_seasons
      anchors = tn15content.at('p').css('a')
      anchors.map { |a| a["href"].to_s[/#season-\K.+/] }.compact
    end

    # @return this node wrapped in an array when it is itself an <h4>,
    #   otherwise every <h4> found beneath it
    def episodes_h4
      name == "h4" ? [self] : css('h4')
    end

    # @return [String, nil] the href value when exactly one is found under
    #   this node, otherwise nil
    def link
      css('@href').count == 1 ? at('@href').value : nil
    end
  end
end
@@ -0,0 +1,32 @@
1
module IMDB
  module Page
    # Base class for page scrapers: trims raw IMDb HTML down to the page
    # content and parses it.
    class BasicPage
      include HTTParty
      base_uri IMDB::BASE_URI
      include IMDB::REGEXP::HTML

      # Appends `stub_str` to this class's base URI.
      #
      # @param stub_str [String] path segment to append
      def self.stub(stub_str)
        base_uri File.join(base_uri, stub_str)
      end

      # Reduces a raw page to its content: everything from the pagecontent
      # div onwards, minus the footer, the injected billboard block and all
      # <br> tags.
      #
      # The original indexed the string with DIV_ID_FOOTER, which kept only
      # the footer (that pattern matches from the footer div to the end of
      # the string); the footer is now removed instead.
      #
      # @param raw_html [String] complete page source
      # @return [String] the trimmed markup
      # @raise [NoMethodError] when the page has no pagecontent div
      def target(raw_html)
        raw_html[DIV_ID_PAGECONTENT]
          .sub(DIV_ID_FOOTER, "")
          .sub(BILLBOARD, "")
          .gsub(BR, "")
      end

      # Parses the trimmed content of `raw_html`.
      #
      # The original took no argument and called `target` without one, so it
      # raised ArgumentError on every call.
      #
      # @param raw_html [String] complete page source
      def noko(raw_html)
        Nokogiri::HTML.parse(target(raw_html))
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
module IMDB
  module REGEXP
    # Patterns for season and episode numbers found in text.
    module CONTENT
      # Digits after "season " (case-insensitive). \K drops the label, so
      # the match is the number alone.
      SEASON = /season \K[0-9]+/i

      # Digits after "episode " (case-insensitive), label dropped the same way.
      EPISODE = /episode \K[0-9]+/i

      # Compact codes such as "S01E02", "1x02" or "1.02": capture 1 is the
      # season, capture 2 the episode.
      SxE = /[Ss]?([0-9]+)[\.EeXx]([0-9]+)/
    end
  end
end
@@ -0,0 +1,36 @@
1
module IMDB
  module REGEXP
    # Patterns applied to raw IMDb markup.
    module HTML
      # Page sections. With the /m flag ".+" runs to the end of the string,
      # so each of these matches from the opening div to the end of the page.
      DIV_ID_FOOTER      = %r{^\s+<div id="footer".+}m
      DIV_ID_PAGECONTENT = %r{^\s+<div id="pagecontent">.+}m

      # <head> metadata; capture 1 is the value.
      LINK_REL_CANONICAL = %r{^\s+<link rel="canonical" href="(.*?)">}
      TITLE              = %r{<title>(.*?)</title>}
      META_CONTENT_TYPE  = %r{<meta http-equiv="content-type" content="(.*?)">}

      # Injected billboard block, from its opening comment to its closing
      # comment. The parentheses in the closing comment are escaped; left
      # unescaped they formed a capture group and the literal
      # "_get_ad_for_slot(INJECTED_BILLBOARD)" could never match.
      BILLBOARD = %r{<!-- begin injectable INJECTED_BILLBOARD.*?<!-- _get_ad_for_slot\(INJECTED_BILLBOARD\) -->}m
      BR        = %r{<br.*?>}

      # Numeric ids inside hrefs; \K drops the path prefix from the match.
      TT = %r{/title/tt\K[0-9]+}
      NM = %r{/name/nm\K[0-9]+}
      CH = %r{/character/ch\K[0-9]+}

      # Cast-table cells.
      TD_NM       = %r{<td class="nm"><a href="/name/nm([0-9]+)/">(.*?)</a></td>}
      TD_DDD      = %r{<td class="ddd"> \.\.\. </td>}
      IN_PARENS   = %r{\((.*?)\)}
      CHAR_CREDIT = %r{<a href="/character/ch([0-9]+)/">(.*?)</a>(?:\s+#{ IN_PARENS })*}
      # NOTE(review): this pattern ends with a literal space after "</td>";
      # confirm that is intended.
      TD_CHAR     = %r{<td class="char">(?:#{ CHAR_CREDIT })(?: / #{ CHAR_CREDIT })*</td> }

      # Generic site link: captures the section, two-letter id prefix,
      # numeric id and link text.
      LINK             = %r{<a href="/([a-z]+)/([a-z]{2})([0-9]+)/.*?".?>\s*?(\S.*?)</a>}
      LINK_PLUS_PARENS = %r{#{LINK}(?:\s*?#{IN_PARENS})*}

      # Matches the space that directly precedes an opening parenthesis.
      SPLIT_BEFORE_PAREN = %r{ (?=\()}
    end
  end
end
@@ -0,0 +1,34 @@
1
require 'simplecov'

# Adds a configuration call that empties SimpleCov's filter list.
module SimpleCov::Configuration
  def clean_filters
    @filters = []
  end
end

# Start from an empty filter list, then load the test-framework settings.
SimpleCov.configure do
  clean_filters
  load_adapter 'test_frameworks'
end

# Coverage is collected only when COVERAGE is set in the environment.
if ENV["COVERAGE"]
  SimpleCov.start { add_filter "/.rvm/" }
end

require 'rubygems'
require 'bundler'

# Activate the default and development gem groups; on failure print the
# reason plus a hint and exit with Bundler's status code.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  $stderr.puts bundler_error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end

require 'test/unit'
require 'shoulda'

# Put the test directory first on the load path, followed by the gem's lib.
test_dir = File.dirname(__FILE__)
lib_dir = File.join(File.dirname(__FILE__), '..', 'lib')
$LOAD_PATH.unshift(test_dir, lib_dir)
require 'imdb-html'

# Reopened without additions; a place for shared test helpers.
class Test::Unit::TestCase
end
@@ -0,0 +1,7 @@
1
require 'helper'

# Placeholder suite: its only case calls `flunk`, so it always fails.
class TestImdbHtml < Test::Unit::TestCase
  should("probably rename this file and start testing for real") do
    flunk "hey buddy, you should probably rename this file and start testing for real"
  end
end
metadata ADDED
@@ -0,0 +1,190 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: imdb-html
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - michael d. towle
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-06-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: mechanize
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.7'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.6'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.6'
41
+ - !ruby/object:Gem::Dependency
42
+ name: activesupport
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '4.1'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '4.1'
55
+ - !ruby/object:Gem::Dependency
56
+ name: httparty
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '0.13'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '0.13'
69
+ - !ruby/object:Gem::Dependency
70
+ name: shoulda
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.5'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.5'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rdoc
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '3.12'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '3.12'
97
+ - !ruby/object:Gem::Dependency
98
+ name: bundler
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '1.6'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '1.6'
111
+ - !ruby/object:Gem::Dependency
112
+ name: jeweler
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '2.0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '2.0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: simplecov
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '0.8'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '0.8'
139
+ description: '2'
140
+ email: gametheoretic@gmail.com
141
+ executables: []
142
+ extensions: []
143
+ extra_rdoc_files:
144
+ - LICENSE.txt
145
+ - README.rdoc
146
+ files:
147
+ - ".document"
148
+ - Gemfile
149
+ - Gemfile.lock
150
+ - LICENSE.txt
151
+ - README.rdoc
152
+ - Rakefile
153
+ - lib/imdb-html.rb
154
+ - lib/imdb-html/codes_todo.csv
155
+ - lib/imdb-html/const.rb
156
+ - lib/imdb-html/data/credit.rb
157
+ - lib/imdb-html/data/episode.rb
158
+ - lib/imdb-html/data/show.rb
159
+ - lib/imdb-html/imdb.rb
160
+ - lib/imdb-html/nokogiri/node.rb
161
+ - lib/imdb-html/page/basic_page.rb
162
+ - lib/imdb-html/regexp/content.rb
163
+ - lib/imdb-html/regexp/html.rb
164
+ - test/helper.rb
165
+ - test/test_imdb-html.rb
166
+ homepage: http://github.com/gametheoretic/imdb-html
167
+ licenses:
168
+ - MIT
169
+ metadata: {}
170
+ post_install_message:
171
+ rdoc_options: []
172
+ require_paths:
173
+ - lib
174
+ required_ruby_version: !ruby/object:Gem::Requirement
175
+ requirements:
176
+ - - ">="
177
+ - !ruby/object:Gem::Version
178
+ version: '0'
179
+ required_rubygems_version: !ruby/object:Gem::Requirement
180
+ requirements:
181
+ - - ">="
182
+ - !ruby/object:Gem::Version
183
+ version: '0'
184
+ requirements: []
185
+ rubyforge_project:
186
+ rubygems_version: 2.2.2
187
+ signing_key:
188
+ specification_version: 4
189
+ summary: '1'
190
+ test_files: []