rfilma 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/config/mongoid.yml +8 -0
- data/lib/rfilma.rb +42 -0
- data/lib/rfilma/crawler.rb +70 -0
- data/lib/rfilma/crawlerdb.rb +66 -0
- data/lib/rfilma/pelicula.rb +28 -0
- metadata +49 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 7647d761f2d69f1209da458110a6aff8315e770a
|
4
|
+
data.tar.gz: 11503bae8df3a9ad5c102d12ebed554278b4406e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: a384e53fb27a1d3127a3ddb2e6879ad0bcdeebf66dc3c6bda41884f81273ce2ac7bce0c3317fccde60d519847c4745f7b779df707ff589689e5d3f3a588cb654
|
7
|
+
data.tar.gz: 146855783ab0c18329b0508ecfee91f9d8282cbe25f69f1d6f4c378c3ca24ef45f991a8591b77b010c3d4715d0996406398bfe2d6b974d6f040e341898c80d62
|
data/lib/rfilma.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
require "rfilma/crawler"
|
2
|
+
require "rfilma/crawlerdb"
|
3
|
+
require "rfilma/pelicula"
|
4
|
+
|
5
|
+
# Facade that combines the live web crawler (Crawler) with the
# database-backed crawler (CrawlerDB).
class RFilma
  attr_accessor :crawler, :crawlerdb

  # Builds one web crawler and one database-backed crawler.
  def initialize
    @crawler = Crawler.new
    @crawlerdb = CrawlerDB.new
  end

  # Searches films by title.
  #
  # @param titulo [String] title (or fragment) to search for
  # @param web [Boolean] when true, search through the web crawler and ask
  #   the database crawler to save every hit; when false, search through
  #   the database crawler only
  # @return [Array] whatever the selected crawler's +buscar_por_titulo+ returns
  def buscar_por_titulo(titulo, web = false)
    return @crawlerdb.buscar_por_titulo(titulo) unless web

    result = @crawler.buscar_por_titulo(titulo)
    # NOTE(review): guardar_pelicula receives only the id, so each film found
    # here appears to be fetched a second time when saved - confirm whether
    # that double fetch is intended.
    result.each { |pelicula| @crawlerdb.guardar_pelicula(pelicula["id"]) }
    result
  end

  # Processes and saves one listing category chosen from +caracter+.
  #
  # Input mapping: a string containing a letter -> that letter, upcased (A-Z);
  # otherwise a string containing a digit -> "0-9"; anything else -> "*".
  #
  # @param caracter [String] character that selects the category
  # @return [Object] the result of +CrawlerDB#guardar_peliculas+
  def actualizar_por_letra(caracter)
    # The first A-Z letter wins; the regex is evaluated only once.
    letra = caracter.upcase[/[A-Z]/]
    categoria =
      if letra
        letra
      elsif caracter =~ /[0-9]/
        "0-9"
      else
        "*"
      end
    @crawlerdb.guardar_peliculas(@crawlerdb.procesar_paginas(categoria))
  end

  # Delegates to +CrawlerDB#procesar_todo+.
  #
  # @return [Object] the result of +CrawlerDB#procesar_todo+
  def actualizar_todo
    @crawlerdb.procesar_todo
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require "set"
require "uri"
require "mechanize"
require "thread/pool"
|
4
|
+
|
5
|
+
# Scrapes film data from the FilmAffinity website through a Mechanize agent.
class Crawler
  # Creates the HTTP agent with the "Windows Mozilla" user-agent alias.
  def initialize
    # The agent is kept in @a because CrawlerDB reads that variable directly.
    @a = Mechanize.new { |op| op.user_agent_alias = "Windows Mozilla" }
  end

  # Downloads a film page and extracts its data.
  #
  # @param id [Integer] film id, interpolated into the film page URL
  # @return [Hash{String=>Object}] always has "id", "titulo", "puntuacion" and
  #   "portada" (nil when the page shows no cover); the remaining keys are
  #   present only when the matching entry appears in the "movie-info" list
  def obtener_pelicula(id)
    page = @a.get("http://www.filmaffinity.com/es/film#{id}.html").body
    doc = Nokogiri::HTML(page)
    data = {
      "id" => id,
      # NOTE(review): inner_html keeps HTML entities (e.g. "&amp;") in the
      # stored title - confirm whether plain text was intended.
      "titulo" => doc.xpath("//h1[@id='main-title']/a/span").inner_html,
      # The rating uses a decimal comma; convert it before parsing as Float.
      "puntuacion" => doc.xpath('//div[@id="movie-rat-avg"]').text.strip.tr(",", ".").to_f,
      "portada" => extraer_portada(doc)
    }
    doc.xpath('//dl[@class="movie-info"]/dt').each do |dt|
      dd = dt.next_element
      next unless dd # a label without a following element carries no value

      extraer_campo(data, dt.inner_html, dd)
    end
    data
  end

  # Searches the site by title and fetches every film found.
  #
  # @param titulo [String] title to search for; surrounding whitespace is ignored
  # @return [Array<Hash>] one +obtener_pelicula+ hash per result card
  def buscar_por_titulo(titulo)
    # Form-encode the whole title so characters such as "&" or "#" cannot
    # break the query string (spaces still become "+").
    consulta = URI.encode_www_form_component(titulo.strip)
    pagina = @a.get("http://www.filmaffinity.com/es/search.php?stext=#{consulta}&stype=title").body
    doc = Nokogiri::HTML(pagina)
    indices_pelis = doc.xpath('//div[@class="movie-card movie-card-1"]').map do |mc|
      mc["data-movie-id"].to_i
    end
    indices_pelis.map { |i| obtener_pelicula(i) }
  end

  private

  # Returns the cover URL: the link's href when the image is wrapped in a
  # link, otherwise the bare image's src, or nil when neither is present.
  def extraer_portada(doc)
    enlace = doc.at_xpath('//div[@id="movie-main-image-container"]/a')
    return enlace["href"] if enlace

    imagen = doc.at_xpath('//div[@id="movie-main-image-container"]/img')
    imagen && imagen["src"]
  end

  # Stores in +data+ the value of +dd+ under the key that matches the label
  # text +etiqueta+; unknown labels are ignored.
  def extraer_campo(data, etiqueta, dd)
    case
    when etiqueta.include?("Título original")
      data["titulo_original"] = dd.text
    when etiqueta.include?("Año")
      data["año"] = dd.text.to_i
    when etiqueta.include?("Duración")
      # First run of digits in the text; 0 when there is none.
      data["duracion"] = dd.text[/\d+/].to_i
    when etiqueta.include?("País")
      bandera = dd.at('img')
      data["pais"] = bandera['title'] if bandera
    when etiqueta.include?("Director")
      data["director"] = dd.search('a').map { |e| e.inner_html.strip }
    when etiqueta.include?("Guión")
      data["guion"] = separar_por_comas(dd)
    when etiqueta.include?("Música")
      data["musica"] = separar_por_comas(dd)
    when etiqueta.include?("Fotografía")
      data["fotografia"] = separar_por_comas(dd)
    when etiqueta.include?("Reparto")
      data["reparto"] = separar_por_comas(dd)
    when etiqueta.include?("Productora")
      data["productora"] = dd.text
    when etiqueta.include?("Género")
      data["genero"] = dd.search('a').map { |e| e.inner_html }
    when etiqueta.include?("Web")
      data["web"] = dd.text
    when etiqueta.include?("Sinopsis")
      data["sinopsis"] = dd.text
    end
  end

  # Splits the text of +dd+ on commas and strips each piece.
  def separar_por_comas(dd)
    dd.text.split(",").map(&:strip)
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require_relative "pelicula"
|
2
|
+
require_relative "crawler"
|
3
|
+
|
4
|
+
# Crawler variant backed by the Pelicula collection: lookups read from the
# database, and the crawl methods store what they download into it.
class CrawlerDB < Crawler
  # Looks a film up in the database.
  #
  # @param id [Integer] film id
  # @return [Array<Hash>] JSON-style hashes of the documents whose id matches
  def obtener_pelicula(id)
    Pelicula.where(id: id).as_json
  end

  # Case-insensitive substring search on the stored title.
  #
  # @param titulo [String] literal text to look for
  # @return [Array<Hash>] JSON-style hashes of the matching documents
  def buscar_por_titulo(titulo)
    # Escape the input so it is matched literally; characters such as "(" or
    # "?" would otherwise be read as regular-expression syntax.
    Pelicula.where(titulo: /#{Regexp.escape(titulo)}/i).as_json
  end

  # Downloads one film from the web and upserts it into the database.
  #
  # @param id [Integer] film id
  def guardar_pelicula(id)
    # A fresh Crawler is used because obtener_pelicula is overridden in this
    # class to read from the database instead of the web.
    datos = Crawler.new.obtener_pelicula(id)
    Pelicula.new(datos).upsert
  end

  # Downloads and saves, in a thread pool, the films among +ids+ that are
  # not yet stored.
  #
  # @param ids [Array<Integer>] candidate film ids
  # @param nthread [Integer] number of worker threads
  def guardar_peliculas(ids, nthread = 5)
    # A criteria query returns only the documents that exist; `find(ids)`
    # may raise when some ids are missing, which is the expected case here.
    existentes = Pelicula.in(_id: ids).map { |peli| peli["id"] }
    pool = Thread.pool(nthread)
    (ids - existentes).each do |i|
      pool.process { guardar_pelicula(i) }
    end
    pool.shutdown
  end

  # Walks every listing page of one category and collects the film ids.
  #
  # @param letra [String] category token used in the listing URL
  #   (this class passes "A".."Z", "0-9" and "*")
  # @return [Array<Integer>] film ids without duplicates, in order of appearance
  def procesar_paginas(letra)
    indices_pelis = []
    pagina = 1
    loop do
      # NOTE(review): procesar_todo calls this method from several threads
      # that share @a - confirm the agent tolerates concurrent use.
      html = @a.get("http://www.filmaffinity.com/es/allfilms_#{letra}_#{pagina}.html").body
      doc = Nokogiri::HTML(html)
      doc.xpath('//div[@class="movie-card movie-card-1"]').each do |mc|
        indices_pelis << mc["data-movie-id"].to_i
      end
      # A ">>" link in the pager means there is another page to fetch.
      siguiente = doc.xpath('//div[@class="pager"]/a[contains(text(),">>")]').text
      break unless siguiente.include?(">>")

      pagina += 1
    end
    # Avoid duplicated ids.
    indices_pelis.uniq
  end

  # Processes and saves every category ("A".."Z", "*" and "0-9") using a
  # pool of five threads.
  def procesar_todo
    categorias = ("A".."Z").to_a << "*" << "0-9"
    pool = Thread.pool(5)
    categorias.each do |c|
      pool.process { guardar_peliculas(procesar_paginas(c)) }
    end
    pool.shutdown
  end
end
|
66
|
+
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require "mongoid"
|
2
|
+
|
3
|
+
# Load the Mongoid settings bundled with the gem (lib/config/mongoid.yml,
# one directory above this file), always using the :production environment.
config_dir = File.join(File.dirname(__FILE__), "..", "config")
Mongoid.load!(File.join(config_dir, "mongoid.yml"), :production)
|
5
|
+
|
6
|
+
# Mongoid document that describes one film.
class Pelicula
  include Mongoid::Document

  # Field name / type pairs, declared in this exact order.
  [
    [:id,              Integer],
    [:titulo,          String],
    [:titulo_original, String],
    [:año,             Integer],
    [:duracion,        Integer],
    [:pais,            String],
    [:director,        Array],
    [:guion,           Array],
    [:musica,          Array],
    [:fotografia,      Array],
    [:reparto,         Array],
    [:productora,      String],
    [:genero,          Array],
    [:sinopsis,        String],
    [:puntuacion,      Float],
    [:web,             String],
    [:portada,         String]
  ].each { |nombre, tipo| field nombre, type: tipo }

  # The document key defaults to the value of the film's own id.
  field :_id, type: Integer, overwrite: true, default: -> { id }
end
|
metadata
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rfilma
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jose Antonio PB
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-11-08 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: RFilma is a Ruby library for crawling data from the FilmAffinity website
|
14
|
+
email: aztuzeca@gmail.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/config/mongoid.yml
|
20
|
+
- lib/rfilma.rb
|
21
|
+
- lib/rfilma/crawler.rb
|
22
|
+
- lib/rfilma/crawlerdb.rb
|
23
|
+
- lib/rfilma/pelicula.rb
|
24
|
+
homepage: https://github.com/aztuzeca/rfilma
|
25
|
+
licenses:
|
26
|
+
- MIT
|
27
|
+
metadata: {}
|
28
|
+
post_install_message:
|
29
|
+
rdoc_options: []
|
30
|
+
require_paths:
|
31
|
+
- lib
|
32
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
33
|
+
requirements:
|
34
|
+
- - ">="
|
35
|
+
- !ruby/object:Gem::Version
|
36
|
+
version: '0'
|
37
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - ">="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '0'
|
42
|
+
requirements: []
|
43
|
+
rubyforge_project:
|
44
|
+
rubygems_version: 2.4.2
|
45
|
+
signing_key:
|
46
|
+
specification_version: 4
|
47
|
+
summary: FilmAffinity Crawler
|
48
|
+
test_files: []
|
49
|
+
has_rdoc:
|