rfilma 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7647d761f2d69f1209da458110a6aff8315e770a
4
+ data.tar.gz: 11503bae8df3a9ad5c102d12ebed554278b4406e
5
+ SHA512:
6
+ metadata.gz: a384e53fb27a1d3127a3ddb2e6879ad0bcdeebf66dc3c6bda41884f81273ce2ac7bce0c3317fccde60d519847c4745f7b779df707ff589689e5d3f3a588cb654
7
+ data.tar.gz: 146855783ab0c18329b0508ecfee91f9d8282cbe25f69f1d6f4c378c3ca24ef45f991a8591b77b010c3d4715d0996406398bfe2d6b974d6f040e341898c80d62
@@ -0,0 +1,8 @@
1
+ production:
2
+ sessions:
3
+ default:
4
+ database: filmadb
5
+ hosts:
6
+ - localhost:27017
7
+ options:
8
+ raise_not_found_error: false
@@ -0,0 +1,42 @@
1
+ require "rfilma/crawler"
2
+ require "rfilma/crawlerdb"
3
+ require "rfilma/pelicula"
4
+
5
# Facade over the web crawler (Crawler) and the database-backed crawler
# (CrawlerDB): title search plus the bulk update entry points.
class RFilma

  attr_accessor :crawler, :crawlerdb

  # Builds one Crawler and one CrawlerDB and exposes them through the
  # +crawler+ and +crawlerdb+ accessors.
  def initialize
    @crawler = Crawler.new
    @crawlerdb = CrawlerDB.new
  end

  # Searches films by title.
  #
  # titulo - title text to search for.
  # web    - when truthy, the search goes through +crawler+ and every hit is
  #          handed to +crawlerdb.guardar_pelicula+ by its "id" key;
  #          otherwise only +crawlerdb+ is queried.
  #
  # Returns whatever the queried object's +buscar_por_titulo+ returned.
  def buscar_por_titulo(titulo, web = false)
    return @crawlerdb.buscar_por_titulo(titulo) unless web

    result = @crawler.buscar_por_titulo(titulo)
    result.each { |pelicula| @crawlerdb.guardar_pelicula(pelicula["id"]) }
    result
  end

  # Refreshes one listing category.
  #
  # Input: 1->(A-Z) 2->(0-9) 3->(*)
  #
  # caracter - anything responding to +to_s+. The first ASCII letter found
  #            (case-insensitive) selects that letter's category; failing
  #            that, any digit selects "0-9"; everything else selects "*".
  #
  # Returns the result of +crawlerdb.guardar_peliculas+.
  def actualizar_por_letra(caracter)
    # to_s lets callers pass e.g. the Integer 7 instead of the String "7".
    texto = caracter.to_s
    letra = texto.upcase[/[A-Z]/]
    categoria =
      if letra
        letra
      elsif texto =~ /[0-9]/
        "0-9"
      else
        "*"
      end
    @crawlerdb.guardar_peliculas(@crawlerdb.procesar_paginas(categoria))
  end

  # Delegates a full refresh to +crawlerdb.procesar_todo+.
  def actualizar_todo
    @crawlerdb.procesar_todo
  end

end
@@ -0,0 +1,70 @@
1
+ require "mechanize"
2
+ require "set"
3
+ require "thread/pool"
4
+
5
# Scrapes film data from www.filmaffinity.com through a Mechanize agent that
# identifies itself with the "Windows Mozilla" user-agent alias.
class Crawler

  def initialize
    @a = Mechanize.new { |op|
      op.user_agent_alias = "Windows Mozilla"
    }
  end

  # Downloads the film page for +id+ and extracts its data.
  #
  # id - film identifier, interpolated into the film page URL.
  #
  # Returns a Hash with String keys. "id", "titulo", "puntuacion" and
  # "portada" are always present ("portada" is nil when the page has no
  # poster node); the remaining keys ("titulo_original", "año", "duracion",
  # "pais", "director", "guion", "musica", "fotografia", "reparto",
  # "productora", "genero", "web", "sinopsis") are present only when the
  # matching <dt> label is found in the movie-info list.
  def obtener_pelicula(id)
    page = @a.get("http://www.filmaffinity.com/es/film#{id}.html").body
    doc = Nokogiri::HTML(page)
    data = {}
    data["id"] = id
    data["titulo"] = doc.xpath("//h1[@id='main-title']/a/span").inner_html
    # The rating is written with a decimal comma on the page.
    data["puntuacion"] = doc.xpath('//div[@id="movie-rat-avg"]').text.strip.gsub(",", ".").to_f
    data["portada"] = extraer_portada(doc)
    doc.xpath('//dl[@class="movie-info"]/dt').each { |dt|
      valor = dt.next_element
      # A label without a following element carries no value to read.
      next if valor.nil?

      leer_campo(data, dt.inner_html, valor)
    }
    data
  end

  # Searches the site by title and scrapes every film listed in the results.
  #
  # titulo - title text; surrounding whitespace is stripped.
  #
  # Returns an Array with one +obtener_pelicula+ Hash per result card.
  def buscar_por_titulo(titulo)
    # Passing the query as parameters lets Mechanize build and encode the
    # query string, so "&", "#", accents, etc. in the title stay intact.
    html = @a.get(
      "http://www.filmaffinity.com/es/search.php",
      { "stext" => titulo.strip, "stype" => "title" }
    ).body
    doc = Nokogiri::HTML(html)
    indices_pelis = doc.xpath('//div[@class="movie-card movie-card-1"]').map { |mc|
      mc["data-movie-id"].to_i
    }
    indices_pelis.map { |i| obtener_pelicula(i) }
  end

  private

  # Returns the poster URL: the link target when the poster is wrapped in a
  # link, otherwise the image source; nil when the page has neither node.
  def extraer_portada(doc)
    contenedor = '//div[@id="movie-main-image-container"]'
    enlace = doc.xpath("#{contenedor}/a")[0]
    return enlace["href"] if enlace

    imagen = doc.xpath("#{contenedor}/img")[0]
    imagen ? imagen["src"] : nil
  end

  # Splits the text of +nodo+ on commas and strips each piece.
  def separar_nombres(nodo)
    nodo.text.split(",").map { |e| e.strip }
  end

  # Stores in +data+ the value held by +valor+ under the key that matches the
  # label text +etiqueta+. Labels are tested in order; unknown labels are
  # ignored.
  def leer_campo(data, etiqueta, valor)
    case
    when etiqueta.include?("Título original")
      data["titulo_original"] = valor.text
    when etiqueta.include?("Año")
      data["año"] = valor.text.to_i
    when etiqueta.include?("Duración")
      # First run of digits anywhere in the text; 0 when there is none.
      data["duracion"] = valor.text[/\d+/].to_i
    when etiqueta.include?("País")
      bandera = valor.at('img')
      data["pais"] = bandera['title'] if bandera
    when etiqueta.include?("Director")
      data["director"] = valor.search('a').map { |e| e.inner_html.strip }
    when etiqueta.include?("Guión")
      data["guion"] = separar_nombres(valor)
    when etiqueta.include?("Música")
      data["musica"] = separar_nombres(valor)
    when etiqueta.include?("Fotografía")
      data["fotografia"] = separar_nombres(valor)
    when etiqueta.include?("Reparto")
      data["reparto"] = separar_nombres(valor)
    when etiqueta.include?("Productora")
      data["productora"] = valor.text
    when etiqueta.include?("Género")
      data["genero"] = valor.search('a').map { |e| e.inner_html }
    when etiqueta.include?("Web")
      data["web"] = valor.text
    when etiqueta.include?("Sinopsis")
      data["sinopsis"] = valor.text
    end
  end

end
@@ -0,0 +1,66 @@
1
+ require_relative "pelicula"
2
+ require_relative "crawler"
3
+
4
# Crawler variant backed by the Pelicula store: lookups read from the
# database, and the +guardar_*+ / +procesar_*+ methods scrape the website and
# persist what they find.
class CrawlerDB < Crawler

  # Returns the JSON-ready form of the stored films whose +id+ field equals
  # the given id (a query result, not a single document).
  def obtener_pelicula(id)
    Pelicula.where(id: id).as_json
  end

  # Case-insensitive substring search on the stored +titulo+ field.
  #
  # The title is escaped before being embedded in the pattern, so characters
  # such as "(" or "*" are matched literally instead of raising RegexpError
  # or acting as regex operators.
  def buscar_por_titulo(titulo)
    Pelicula.where(titulo: /#{Regexp.escape(titulo.to_s)}/i).as_json
  end

  # Scrapes the film with the given id from the website and upserts it.
  #
  # A fresh Crawler is used because this class overrides +obtener_pelicula+
  # with the database lookup.
  # NOTE(review): the per-call Crawler also gives each pool thread its own
  # Mechanize agent; presumably intentional, confirm before sharing one.
  def guardar_pelicula(id)
    datos = Crawler.new.obtener_pelicula(id)
    Pelicula.new(datos).upsert
  end

  # Scrapes and stores the films in +ids+ that are not stored yet.
  #
  # ids     - Array of film ids.
  # nthread - size of the thread pool used for the downloads (default 5).
  #
  # Returns the result of the pool's +shutdown+.
  def guardar_peliculas(ids, nthread = 5)
    # The bundled mongoid.yml sets raise_not_found_error: false, so missing
    # ids do not make this lookup raise.
    guardados = Pelicula.find(ids).map { |peli| peli["id"] }
    pendientes = ids - guardados
    pool = Thread.pool(nthread)
    pendientes.each { |i|
      pool.process {
        guardar_pelicula(i)
      }
    }
    pool.shutdown
  end

  # Walks every listing page of category +letra+ ("A".."Z", "0-9" or "*")
  # and collects the film ids found on them.
  #
  # Returns an Array of Integer ids without duplicates, in order of first
  # appearance.
  def procesar_paginas(letra)
    indices_pelis = []
    pagina = 1
    loop do
      html = @a.get("http://www.filmaffinity.com/es/allfilms_#{letra}_#{pagina}.html").body
      doc = Nokogiri::HTML(html)
      doc.xpath('//div[@class="movie-card movie-card-1"]').each { |mc|
        indices_pelis << mc["data-movie-id"].to_i
      }
      # The pager shows a ">>" link while there are more pages to visit.
      siguiente = doc.xpath('//div[@class="pager"]/a[contains(text(),">>")]').text
      break unless siguiente.include?(">>")

      pagina += 1
    end
    # Avoid duplicated ids
    indices_pelis.uniq
  end

  # Processes every category ("A".."Z", "*" and "0-9") on a pool of five
  # threads, storing the films found in each one.
  #
  # NOTE(review): the pool threads all call +procesar_paginas+ on this
  # instance and therefore share its single agent (@a) -- confirm that the
  # agent tolerates concurrent use.
  def procesar_todo
    cat = ('A'..'Z').to_a << "*" << "0-9"
    pool = Thread.pool(5)
    cat.each { |c|
      pool.process {
        guardar_peliculas(procesar_paginas(c))
      }
    }
    pool.shutdown
  end

end
66
+
@@ -0,0 +1,28 @@
1
+ require "mongoid"
2
+
3
# Load the Mongoid settings from ../config/mongoid.yml (relative to this
# file's directory), using its :production environment.
config_dir = File.join(File.dirname(__FILE__), "..", "config")
Mongoid.load!(File.join(config_dir, "mongoid.yml"), :production)
5
+
6
# Mongoid document describing one film.
class Pelicula

  include Mongoid::Document

  # Attribute name / type pairs, declared in this exact order.
  [
    [:id,              Integer],
    [:titulo,          String],
    [:titulo_original, String],
    [:año,             Integer],
    [:duracion,        Integer],
    [:pais,            String],
    [:director,        Array],
    [:guion,           Array],
    [:musica,          Array],
    [:fotografia,      Array],
    [:reparto,         Array],
    [:productora,      String],
    [:genero,          Array],
    [:sinopsis,        String],
    [:puntuacion,      Float],
    [:web,             String],
    [:portada,         String]
  ].each { |nombre, tipo| field nombre, type: tipo }

  # The document key (_id) is an Integer that defaults to the film's own id.
  field :_id, type: Integer, overwrite: true, default: ->{ id }
end
metadata ADDED
@@ -0,0 +1,49 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rfilma
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Jose Antonio PB
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-11-08 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: RFilma is a Ruby library for crawling data from the FilmAffinity website
14
+ email: aztuzeca@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/config/mongoid.yml
20
+ - lib/rfilma.rb
21
+ - lib/rfilma/crawler.rb
22
+ - lib/rfilma/crawlerdb.rb
23
+ - lib/rfilma/pelicula.rb
24
+ homepage: https://github.com/aztuzeca/rfilma
25
+ licenses:
26
+ - MIT
27
+ metadata: {}
28
+ post_install_message:
29
+ rdoc_options: []
30
+ require_paths:
31
+ - lib
32
+ required_ruby_version: !ruby/object:Gem::Requirement
33
+ requirements:
34
+ - - ">="
35
+ - !ruby/object:Gem::Version
36
+ version: '0'
37
+ required_rubygems_version: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ requirements: []
43
+ rubyforge_project:
44
+ rubygems_version: 2.4.2
45
+ signing_key:
46
+ specification_version: 4
47
+ summary: FilmAffinity Crawler
48
+ test_files: []
49
+ has_rdoc: