rfilma 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7647d761f2d69f1209da458110a6aff8315e770a
4
+ data.tar.gz: 11503bae8df3a9ad5c102d12ebed554278b4406e
5
+ SHA512:
6
+ metadata.gz: a384e53fb27a1d3127a3ddb2e6879ad0bcdeebf66dc3c6bda41884f81273ce2ac7bce0c3317fccde60d519847c4745f7b779df707ff589689e5d3f3a588cb654
7
+ data.tar.gz: 146855783ab0c18329b0508ecfee91f9d8282cbe25f69f1d6f4c378c3ca24ef45f991a8591b77b010c3d4715d0996406398bfe2d6b974d6f040e341898c80d62
@@ -0,0 +1,8 @@
1
+ production:
2
+ sessions:
3
+ default:
4
+ database: filmadb
5
+ hosts:
6
+ - localhost:27017
7
+ options:
8
+ raise_not_found_error: false
@@ -0,0 +1,42 @@
1
+ require "rfilma/crawler"
2
+ require "rfilma/crawlerdb"
3
+ require "rfilma/pelicula"
4
+
5
# Facade over the two back-ends used by the gem: the live FilmAffinity
# crawler (Crawler) and the database-backed crawler (CrawlerDB).
class RFilma

  attr_accessor :crawler, :crawlerdb

  # Builds one Crawler and one CrawlerDB instance.
  def initialize
    @crawler = Crawler.new
    @crawlerdb = CrawlerDB.new
  end

  # Searches films by title.
  #
  # titulo - title text to search for.
  # web    - when truthy, the search goes through the web crawler and the
  #          "id" of every hit is handed to CrawlerDB#guardar_pelicula;
  #          otherwise only the database-backed search is used.
  #
  # Returns whatever the selected back-end returned.
  def buscar_por_titulo(titulo, web=false)
    return @crawlerdb.buscar_por_titulo(titulo) unless web

    encontradas = @crawler.buscar_por_titulo(titulo)
    encontradas.each { |peli| @crawlerdb.guardar_pelicula(peli["id"]) }
    encontradas
  end

  # Crawls one listing category and stores the films found in it.
  #
  # caracter - category selector. Anything containing a letter selects the
  #            first letter found (upcased, A-Z); otherwise anything
  #            containing a digit selects "0-9"; everything else selects "*".
  #            Non-String values (e.g. the Integer 5 or a Symbol) are
  #            converted with #to_s first.
  #
  # Returns the result of CrawlerDB#guardar_peliculas.
  # Raises ArgumentError when caracter is nil.
  def actualizar_por_letra(caracter)
    raise ArgumentError, "caracter must not be nil" if caracter.nil?

    texto = caracter.to_s
    # Evaluate the letter match once; String#[] yields the match or nil.
    letra = texto.upcase[/[A-Z]/]
    categoria =
      if letra
        letra
      elsif texto =~ /[0-9]/
        "0-9"
      else
        "*"
      end

    pelis = @crawlerdb.procesar_paginas(categoria)
    @crawlerdb.guardar_peliculas(pelis)
  end

  # Delegates a crawl of every category to CrawlerDB#procesar_todo.
  def actualizar_todo
    @crawlerdb.procesar_todo
  end

end
@@ -0,0 +1,70 @@
1
+ require "mechanize"
2
+ require "set"
3
+ require "thread/pool"
4
+
5
require "uri"

# Scrapes film data from the Spanish FilmAffinity pages: pages are fetched
# with a Mechanize agent and parsed with Nokogiri.
class Crawler

  # Creates the Mechanize agent (stored in @a) with the
  # "Windows Mozilla" user-agent alias.
  def initialize
    @a = Mechanize.new { |op| op.user_agent_alias = "Windows Mozilla" }
  end

  # Fetches the page of one film and extracts its data.
  #
  # id - film identifier used to build the film page URL.
  #
  # Returns a Hash with String keys: "id", "titulo", "puntuacion", "portada"
  # plus one entry for every recognised <dt> label of the "movie-info" list
  # ("titulo_original", "año", "duracion", "pais", "director", "guion",
  # "musica", "fotografia", "reparto", "productora", "genero", "web",
  # "sinopsis"). "portada" and "pais" are nil when the page lacks the
  # corresponding node.
  def obtener_pelicula(id)
    page = @a.get("http://www.filmaffinity.com/es/film#{id}.html").body
    doc = Nokogiri::HTML(page)

    data = {}
    data["id"] = id
    data["titulo"] = doc.xpath("//h1[@id='main-title']/a/span").inner_html
    # The rating uses a decimal comma on the page; swap it before to_f.
    data["puntuacion"] = doc.xpath('//div[@id="movie-rat-avg"]').text.strip.gsub(",", ".").to_f
    data["portada"] = extraer_portada(doc)
    doc.xpath('//dl[@class="movie-info"]/dt').each { |dt| extraer_campo(data, dt) }
    data
  end

  # Searches the site by title and fetches every film listed in the results.
  #
  # titulo - title text; surrounding whitespace is stripped and the rest is
  #          form-encoded (spaces become "+", reserved characters such as
  #          "&" or "#" are percent-encoded) so it cannot break the query
  #          string.
  #
  # Returns an Array with the result of #obtener_pelicula for each hit.
  def buscar_por_titulo(titulo)
    consulta = URI.encode_www_form_component(titulo.strip)
    html = @a.get("http://www.filmaffinity.com/es/search.php?stext=#{consulta}&stype=title").body
    doc = Nokogiri::HTML(html)
    indices_pelis = doc.xpath('//div[@class="movie-card movie-card-1"]').map { |mc| mc["data-movie-id"].to_i }
    indices_pelis.map { |i| obtener_pelicula(i) }
  end

  private

  # Returns the poster URL: the href of the link inside the main image
  # container when there is one, otherwise the src of the bare image, or nil
  # when neither node exists.
  def extraer_portada(doc)
    enlace = doc.xpath('//div[@id="movie-main-image-container"]/a')[0]
    return enlace["href"] if enlace

    imagen = doc.xpath('//div[@id="movie-main-image-container"]/img')[0]
    imagen && imagen["src"]
  end

  # Stores in data the value that follows one <dt> label of the film sheet.
  # Labels are tested in a fixed order; unrecognised labels, and labels with
  # no following element, are ignored.
  def extraer_campo(data, dt_nodo)
    dt = dt_nodo.inner_html
    dd = dt_nodo.next_element
    return if dd.nil?

    case
    when dt.include?("Título original")
      data["titulo_original"] = dd.text
    when dt.include?("Año")
      data["año"] = dd.text.to_i
    when dt.include?("Duración")
      # First run of digits anywhere in the text; 0 when there is none.
      data["duracion"] = dd.text[/\d+/].to_i
    when dt.include?("País")
      bandera = dd.at('img')
      data["pais"] = bandera && bandera['title']
    when dt.include?("Director")
      data["director"] = dd.search('a').map { |e| e.inner_html.strip }
    when dt.include?("Guión")
      data["guion"] = separar_lista(dd.text)
    when dt.include?("Música")
      data["musica"] = separar_lista(dd.text)
    when dt.include?("Fotografía")
      data["fotografia"] = separar_lista(dd.text)
    when dt.include?("Reparto")
      data["reparto"] = separar_lista(dd.text)
    when dt.include?("Productora")
      data["productora"] = dd.text
    when dt.include?("Género")
      data["genero"] = dd.search('a').map { |e| e.inner_html }
    when dt.include?("Web")
      data["web"] = dd.text
    when dt.include?("Sinopsis")
      data["sinopsis"] = dd.text
    end
  end

  # Splits a comma-separated text into an Array of stripped entries.
  def separar_lista(texto)
    texto.split(",").map(&:strip)
  end

end
@@ -0,0 +1,66 @@
1
+ require_relative "pelicula"
2
+ require_relative "crawler"
3
+
4
# Crawler variant backed by the Pelicula model: lookups are answered from
# the database and the crawl helpers store scraped films in it.
# The constructor is inherited unchanged from Crawler.
class CrawlerDB < Crawler

  # Looks up stored films whose id field equals id.
  #
  # Returns the JSON-ready representation of the matching criteria.
  def obtener_pelicula(id)
    Pelicula.where(id: id).as_json
  end

  # Case-insensitive substring search on the stored titles.
  #
  # titulo - literal text to look for. It is escaped before being embedded
  #          in the regular expression, so characters such as "(", "." or
  #          "*" match themselves instead of raising RegexpError or
  #          changing the meaning of the pattern.
  #
  # Returns the JSON-ready representation of the matching criteria.
  def buscar_por_titulo(titulo)
    Pelicula.where(titulo: /#{Regexp.escape(titulo.to_s)}/i).as_json
  end

  # Scrapes one film from the website and upserts it into the database.
  #
  # A fresh Crawler is used because this class overrides obtener_pelicula
  # with the database lookup.
  def guardar_pelicula(id)
    datos = Crawler.new.obtener_pelicula(id)
    Pelicula.new(datos).upsert
  end

  # Scrapes and stores the films of ids that are not in the database yet,
  # using a thread pool.
  #
  # ids     - Array of film ids.
  # nthread - pool size (default 5).
  #
  # Returns the result of shutting the pool down.
  def guardar_peliculas(ids, nthread=5)
    pool = Thread.pool(nthread)
    almacenados = Pelicula.find(ids).each.map { |doc| doc["id"] }
    # Symmetric difference, kept as in the original; the second term is
    # presumably always empty because the stored ids come from `ids` itself.
    pendientes = (ids - almacenados) + (almacenados - ids)
    pendientes.each do |i|
      pool.process { guardar_pelicula(i) }
    end
    pool.shutdown
  end

  # Walks the paginated "all films" listing of one category.
  #
  # letra - category token used in the listing URL ("A".."Z", "0-9" or "*").
  #
  # Returns an Array of unique film ids in first-seen order.
  def procesar_paginas(letra)
    indices_pelis = []
    pagina = 1
    loop do
      html = @a.get("http://www.filmaffinity.com/es/allfilms_#{letra}_#{pagina}.html").body
      doc = Nokogiri::HTML(html)
      siguiente = doc.xpath('//div[@class="pager"]/a[contains(text(),">>")]').text
      doc.xpath('//div[@class="movie-card movie-card-1"]').each do |mc|
        indices_pelis << mc["data-movie-id"].to_i
      end
      # The pager shows a ">>" link while there are more pages to visit.
      break unless siguiente.include?(">>")
      pagina += 1
    end
    # Drop duplicated ids.
    indices_pelis.uniq
  end

  # Crawls and stores every category ("A".."Z", "*" and "0-9") using a pool
  # of five threads; each task in turn opens its own pool inside
  # guardar_peliculas.
  # NOTE(review): all tasks share the single @a agent -- confirm that is
  # safe for concurrent requests.
  def procesar_todo
    categorias = ('A'..'Z').to_a << "*" << "0-9"
    pool = Thread.pool(5)
    categorias.each do |c|
      pool.process do
        ids = procesar_paginas(c)
        guardar_peliculas(ids)
      end
    end
    pool.shutdown
  end
end
66
+
@@ -0,0 +1,28 @@
1
+ require "mongoid"
2
+
3
# Load the Mongoid settings bundled with the gem (../config/mongoid.yml
# relative to this file) using the :production environment.
Mongoid.load!(File.join(File.dirname(__FILE__), "..", "config", "mongoid.yml"), :production)
5
+
6
# Mongoid document describing one film.
class Pelicula

  include Mongoid::Document

  # Plain typed fields as [name, type] pairs, declared in this order.
  [
    [:id,              Integer],
    [:titulo,          String],
    [:titulo_original, String],
    [:año,             Integer],
    [:duracion,        Integer],
    [:pais,            String],
    [:director,        Array],
    [:guion,           Array],
    [:musica,          Array],
    [:fotografia,      Array],
    [:reparto,         Array],
    [:productora,      String],
    [:genero,          Array],
    [:sinopsis,        String],
    [:puntuacion,      Float],
    [:web,             String],
    [:portada,         String]
  ].each { |nombre, tipo| field nombre, type: tipo }

  # _id is redefined as an Integer whose default is the document's id field.
  field :_id, type: Integer, overwrite: true, default: ->{ id }
end
metadata ADDED
@@ -0,0 +1,49 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rfilma
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Jose Antonio PB
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-11-08 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: RFilma is a Ruby library for crawling data from the FilmAffinity website
14
+ email: aztuzeca@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/config/mongoid.yml
20
+ - lib/rfilma.rb
21
+ - lib/rfilma/crawler.rb
22
+ - lib/rfilma/crawlerdb.rb
23
+ - lib/rfilma/pelicula.rb
24
+ homepage: https://github.com/aztuzeca/rfilma
25
+ licenses:
26
+ - MIT
27
+ metadata: {}
28
+ post_install_message:
29
+ rdoc_options: []
30
+ require_paths:
31
+ - lib
32
+ required_ruby_version: !ruby/object:Gem::Requirement
33
+ requirements:
34
+ - - ">="
35
+ - !ruby/object:Gem::Version
36
+ version: '0'
37
+ required_rubygems_version: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ requirements: []
43
+ rubyforge_project:
44
+ rubygems_version: 2.4.2
45
+ signing_key:
46
+ specification_version: 4
47
+ summary: FilmAffinity Crawler
48
+ test_files: []
49
+ has_rdoc: