rfilma 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/config/mongoid.yml +8 -0
- data/lib/rfilma.rb +42 -0
- data/lib/rfilma/crawler.rb +70 -0
- data/lib/rfilma/crawlerdb.rb +66 -0
- data/lib/rfilma/pelicula.rb +28 -0
- metadata +49 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 7647d761f2d69f1209da458110a6aff8315e770a
|
4
|
+
data.tar.gz: 11503bae8df3a9ad5c102d12ebed554278b4406e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: a384e53fb27a1d3127a3ddb2e6879ad0bcdeebf66dc3c6bda41884f81273ce2ac7bce0c3317fccde60d519847c4745f7b779df707ff589689e5d3f3a588cb654
|
7
|
+
data.tar.gz: 146855783ab0c18329b0508ecfee91f9d8282cbe25f69f1d6f4c378c3ca24ef45f991a8591b77b010c3d4715d0996406398bfe2d6b974d6f040e341898c80d62
|
data/lib/rfilma.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
require "rfilma/crawler"
|
2
|
+
require "rfilma/crawlerdb"
|
3
|
+
require "rfilma/pelicula"
|
4
|
+
|
5
|
+
# Entry point of the library: combines the live web crawler (Crawler) with
# the database-backed crawler (CrawlerDB).
class RFilma
  attr_accessor :crawler, :crawlerdb

  # Builds one web crawler and one database-backed crawler.
  def initialize
    @crawler = Crawler.new
    @crawlerdb = CrawlerDB.new
  end

  # Searches films by title.
  #
  # @param titulo [String] title (or part of it) to look for
  # @param web [Boolean] when true the search is delegated to the web crawler
  #   and every hit is handed to the database crawler to be stored; when
  #   false (default) only the database crawler is queried
  # @return whatever the selected crawler's +buscar_por_titulo+ returns
  def buscar_por_titulo(titulo, web = false)
    return @crawlerdb.buscar_por_titulo(titulo) unless web

    resultados = @crawler.buscar_por_titulo(titulo)
    # NOTE(review): guardar_pelicula receives only the id, so the storing side
    # presumably has to fetch the film again; confirm whether that is intended.
    resultados.each { |peli| @crawlerdb.guardar_pelicula(peli["id"]) }
    resultados
  end

  # Crawls one listing category and stores the films found in it.
  #
  # Input: a letter selects that letter's category (case-insensitive), a
  # digit selects "0-9", anything else selects "*".
  #
  # @param caracter [#to_s] category selector
  # @return the result of +CrawlerDB#guardar_peliculas+
  def actualizar_por_letra(caracter)
    pelis = @crawlerdb.procesar_paginas(categoria_para(caracter))
    @crawlerdb.guardar_peliculas(pelis)
  end

  # Crawls and stores every category.
  def actualizar_todo
    @crawlerdb.procesar_todo
  end

  private

  # Maps a selector onto the category name passed to procesar_paginas.
  # The first ASCII letter found anywhere in the text wins; failing that any
  # digit selects "0-9"; everything else (including "" and nil) selects "*".
  def categoria_para(caracter)
    texto = caracter.to_s
    letra = texto.upcase[/[A-Z]/]
    return letra if letra

    texto =~ /[0-9]/ ? "0-9" : "*"
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require "mechanize"
require "set"
require "thread/pool"
require "uri"
|
4
|
+
|
5
|
+
# Scrapes film data from the FilmAffinity website (Spanish pages).
class Crawler
  # Creates the Mechanize agent (kept in @a, which subclasses also use) and
  # selects the "Windows Mozilla" user-agent alias.
  def initialize
    @a = Mechanize.new do |agente|
      agente.user_agent_alias = "Windows Mozilla"
    end
  end

  # Downloads a film page and extracts its data.
  #
  # @param id [Integer, #to_s] film id, interpolated into the page URL
  # @return [Hash] string-keyed hash; "id", "titulo", "puntuacion" and
  #   "portada" are always present ("portada" is nil when the page has no
  #   poster), the remaining keys only when the matching row exists
  def obtener_pelicula(id)
    page = @a.get("http://www.filmaffinity.com/es/film#{id}.html").body
    doc = Nokogiri::HTML(page)
    data = {
      "id" => id,
      "titulo" => doc.xpath("//h1[@id='main-title']/a/span").inner_html,
      # The page writes the rating with a decimal comma.
      "puntuacion" => doc.xpath('//div[@id="movie-rat-avg"]').text.strip.gsub(",", ".").to_f,
      "portada" => extraer_portada(doc)
    }
    doc.xpath('//dl[@class="movie-info"]/dt').each do |dt|
      dd = dt.next_element
      next if dd.nil? # label without a value element

      campo, valor = extraer_campo(dt.inner_html, dd)
      data[campo] = valor if campo
    end
    data
  end

  # Searches the site by title and scrapes every film found.
  #
  # @param titulo [String] title to search for; surrounding blanks are ignored
  # @return [Array<Hash>] one obtener_pelicula hash per search hit
  def buscar_por_titulo(titulo)
    # Form-encode the query so that "&", "#", "%", accents, etc. cannot
    # corrupt the URL (spaces still become "+").
    consulta = URI.encode_www_form_component(titulo.strip)
    html = @a.get("http://www.filmaffinity.com/es/search.php?stext=#{consulta}&stype=title").body
    doc = Nokogiri::HTML(html)
    indices_pelis = doc.xpath('//div[@class="movie-card movie-card-1"]').map do |mc|
      mc["data-movie-id"].to_i
    end
    indices_pelis.map { |i| obtener_pelicula(i) }
  end

  private

  # Poster URL: the link target inside the image container when there is a
  # link, otherwise the source of the bare image, otherwise nil.
  def extraer_portada(doc)
    contenedor = '//div[@id="movie-main-image-container"]'
    enlace = doc.at_xpath("#{contenedor}/a")
    return enlace["href"] if enlace

    imagen = doc.at_xpath("#{contenedor}/img")
    imagen && imagen["src"]
  end

  # Translates one label/value row of the info list into a [key, value]
  # pair; returns nil for labels that are not recognised.
  def extraer_campo(etiqueta, dd)
    case
    when etiqueta.include?("Título original") then ["titulo_original", dd.text]
    when etiqueta.include?("Año") then ["año", dd.text.to_i]
    when etiqueta.include?("Duración")
      # First run of digits; /\d*/ would match the empty string whenever the
      # text does not start with a digit.
      ["duracion", dd.text[/\d+/].to_i]
    when etiqueta.include?("País")
      bandera = dd.at("img")
      ["pais", bandera && bandera["title"]]
    when etiqueta.include?("Director") then ["director", dd.search("a").map { |e| e.inner_html.strip }]
    when etiqueta.include?("Guión") then ["guion", lista_de_nombres(dd)]
    when etiqueta.include?("Música") then ["musica", lista_de_nombres(dd)]
    when etiqueta.include?("Fotografía") then ["fotografia", lista_de_nombres(dd)]
    when etiqueta.include?("Reparto") then ["reparto", lista_de_nombres(dd)]
    when etiqueta.include?("Productora") then ["productora", dd.text]
    when etiqueta.include?("Género") then ["genero", dd.search("a").map { |e| e.inner_html }]
    when etiqueta.include?("Web") then ["web", dd.text]
    when etiqueta.include?("Sinopsis") then ["sinopsis", dd.text]
    end
  end

  # Splits a comma-separated value into trimmed entries.
  def lista_de_nombres(dd)
    dd.text.split(",").map(&:strip)
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require_relative "pelicula"
|
2
|
+
require_relative "crawler"
|
3
|
+
|
4
|
+
# Crawler variant backed by the Pelicula collection: look-ups are answered
# from the database, and the crawling helpers store what they scrape.
# The constructor is inherited unchanged from Crawler.
class CrawlerDB < Crawler
  # Looks a film up in the database.
  #
  # @param id [Integer] film id
  # @return JSON-ready representation of the matching documents
  def obtener_pelicula(id)
    Pelicula.where(id: id).as_json
  end

  # Case-insensitive substring search on the stored titles.
  #
  # @param titulo [String] text to look for; matched literally
  # @return JSON-ready representation of the matching documents
  def buscar_por_titulo(titulo)
    # Escape the text so that characters such as "(", "[" or "*" neither
    # raise RegexpError nor act as pattern syntax.
    Pelicula.where(titulo: /#{Regexp.escape(titulo)}/i).as_json
  end

  # Scrapes one film from the web and upserts it.
  # A fresh Crawler is used because obtener_pelicula is overridden here to
  # read from the database.
  #
  # @param id [Integer] film id
  def guardar_pelicula(id)
    datos = Crawler.new.obtener_pelicula(id)
    Pelicula.new(datos).upsert
  end

  # Stores, using a thread pool, every film of +ids+ that is not in the
  # database yet.
  #
  # @param ids [Array<Integer>] film ids
  # @param nthread [Integer] number of pool threads
  def guardar_peliculas(ids, nthread = 5)
    # An "$in" query returns only the documents that exist. A multi-id find
    # is documented to raise when an id is missing unless
    # raise_not_found_error is disabled (the bundled mongoid.yml is not
    # visible here, so this avoids depending on that setting).
    ya_guardadas = Pelicula.where(_id: { "$in" => ids }).map { |peli| peli["id"] }
    pendientes = ids - ya_guardadas

    pool = Thread.pool(nthread)
    pendientes.each do |i|
      pool.process { guardar_pelicula(i) }
    end
    # NOTE(review): shutdown is expected to wait for the queued work; confirm
    # against the thread/pool documentation.
    pool.shutdown
  end

  # Walks the paginated listing of one category and collects film ids.
  #
  # @param letra [String] category name as used in the listing URL
  # @return [Array<Integer>] film ids in order of appearance, duplicates removed
  def procesar_paginas(letra)
    indices_pelis = []
    pagina = 1
    loop do
      html = @a.get("http://www.filmaffinity.com/es/allfilms_#{letra}_#{pagina}.html").body
      doc = Nokogiri::HTML(html)
      doc.xpath('//div[@class="movie-card movie-card-1"]').each do |mc|
        indices_pelis << mc["data-movie-id"].to_i
      end
      # Keep going while the pager still shows a ">>" link.
      siguiente = doc.xpath('//div[@class="pager"]/a[contains(text(),">>")]').text
      break unless siguiente.include?(">>")

      pagina += 1
    end
    # Avoid duplicate ids.
    indices_pelis.uniq
  end

  # Crawls and stores every category ("A".."Z", "*" and "0-9"), five
  # categories at a time.
  # NOTE(review): the pool threads all go through the same @a agent; confirm
  # that sharing it between threads is safe.
  def procesar_todo
    categorias = ("A".."Z").to_a + ["*", "0-9"]
    pool = Thread.pool(5)
    categorias.each do |c|
      pool.process { guardar_peliculas(procesar_paginas(c)) }
    end
    pool.shutdown
  end
end
|
66
|
+
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require "mongoid"
|
2
|
+
|
3
|
+
# Load the Mongoid settings from ../config/mongoid.yml (relative to this
# file's directory), using the :production environment.
Mongoid.load!(
  File.join(File.dirname(__FILE__), "..", "config", "mongoid.yml"),
  :production
)
|
5
|
+
|
6
|
+
# Mongoid document holding the data of one film.
class Pelicula
  include Mongoid::Document

  # Attribute name => stored type, declared in this order.
  {
    id: Integer,
    titulo: String,
    titulo_original: String,
    año: Integer,
    duracion: Integer,
    pais: String,
    director: Array,
    guion: Array,
    musica: Array,
    fotografia: Array,
    reparto: Array,
    productora: String,
    genero: Array,
    sinopsis: String,
    puntuacion: Float,
    web: String,
    portada: String
  }.each do |nombre, tipo|
    field nombre, type: tipo
  end

  # The document key defaults to the value of the id attribute.
  field :_id, type: Integer, overwrite: true, default: -> { id }
end
|
metadata
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rfilma
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jose Antonio PB
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-11-08 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: RFilma is a Ruby library for crawling data from the FilmAffinity website
|
14
|
+
email: aztuzeca@gmail.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/config/mongoid.yml
|
20
|
+
- lib/rfilma.rb
|
21
|
+
- lib/rfilma/crawler.rb
|
22
|
+
- lib/rfilma/crawlerdb.rb
|
23
|
+
- lib/rfilma/pelicula.rb
|
24
|
+
homepage: https://github.com/aztuzeca/rfilma
|
25
|
+
licenses:
|
26
|
+
- MIT
|
27
|
+
metadata: {}
|
28
|
+
post_install_message:
|
29
|
+
rdoc_options: []
|
30
|
+
require_paths:
|
31
|
+
- lib
|
32
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
33
|
+
requirements:
|
34
|
+
- - ">="
|
35
|
+
- !ruby/object:Gem::Version
|
36
|
+
version: '0'
|
37
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - ">="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '0'
|
42
|
+
requirements: []
|
43
|
+
rubyforge_project:
|
44
|
+
rubygems_version: 2.4.2
|
45
|
+
signing_key:
|
46
|
+
specification_version: 4
|
47
|
+
summary: FilmAffinity Crawler
|
48
|
+
test_files: []
|
49
|
+
has_rdoc:
|