dmm-crawler 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/Gemfile.lock +4 -4
- data/lib/dmm-crawler/attributes.rb +31 -26
- data/lib/dmm-crawler/ranking.rb +2 -1
- data/lib/dmm-crawler/version.rb +1 -1
- data/spec/dmm-crawler/ranking_spec.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0fbdb558032e848d11224c9030642f04dfee44bf
|
4
|
+
data.tar.gz: ccb2cc27b9aafe47ca5331ae092889345fd177f9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 72f5cd5902904c4fe4100fbacf5374bc28070406112d5ca82a5abd909a5eb3c5d7882b77ca905ce60254c9a8b0daf56d74a7df7fc7c7ae77d54c631cad0e50d0
|
7
|
+
data.tar.gz: 70f8c48093ab24a88c1923312db6c6e2131c2704c04f1d6d20ac603d90b7efa9df1b237313cdfb6be1357064aec89a1c15c4fe831112ae78ea8b4ef8a39be200
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,17 @@
|
|
1
1
|
# Change logs
|
2
2
|
|
3
|
+
## 0.2.1
|
4
|
+
- Add an attribute for author.
|
5
|
+
|
6
|
+
## 0.2.0
|
7
|
+
- Make optional configuration settable for Mechanize.
|
8
|
+
- Fix a bug where data-src is not found
|
9
|
+
- Do not fix types of submedia.
|
10
|
+
- Update `README.md`.
|
11
|
+
|
12
|
+
### Breaking Changes
|
13
|
+
- Do not use passed value of submedia type.
|
14
|
+
|
3
15
|
## 0.1.5
|
4
16
|
- Do not crawl columns related to description.
|
5
17
|
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
dmm-crawler (0.2.0)
|
4
|
+
dmm-crawler (0.2.1)
|
5
5
|
mechanize
|
6
6
|
|
7
7
|
GEM
|
@@ -27,11 +27,11 @@ GEM
|
|
27
27
|
mime-types (3.1)
|
28
28
|
mime-types-data (~> 3.2015)
|
29
29
|
mime-types-data (3.2016.0521)
|
30
|
-
mini_portile2 (2.2.0)
|
30
|
+
mini_portile2 (2.3.0)
|
31
31
|
net-http-digest_auth (1.4.1)
|
32
32
|
net-http-persistent (2.9.4)
|
33
|
-
nokogiri (1.8.0)
|
34
|
-
mini_portile2 (~> 2.2.0)
|
33
|
+
nokogiri (1.8.1)
|
34
|
+
mini_portile2 (~> 2.3.0)
|
35
35
|
ntlm-http (0.1.1)
|
36
36
|
parser (2.4.0.0)
|
37
37
|
ast (~> 2.2)
|
data/lib/dmm-crawler/attributes.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module DMMCrawler
|
2
2
|
class Attributes
|
3
3
|
def initialize(url)
|
4
|
-
@element = Agent.instance.agent.get(url)
|
4
|
+
@page = Agent.instance.agent.get(url)
|
5
5
|
end
|
6
6
|
|
7
7
|
def to_a
|
@@ -10,6 +10,7 @@ module DMMCrawler
|
|
10
10
|
title_link,
|
11
11
|
image_url,
|
12
12
|
submedia,
|
13
|
+
author,
|
13
14
|
informations,
|
14
15
|
tags
|
15
16
|
]
|
@@ -19,16 +20,24 @@ module DMMCrawler
|
|
19
20
|
|
20
21
|
def title
|
21
22
|
if art_page?
|
22
|
-
@element.search('.productTitle__txt span').remove
|
23
|
-
@element.search('.productTitle__txt').text.strip
|
23
|
+
@page.search('.productTitle__txt span').remove
|
24
|
+
@page.search('.productTitle__txt').text.strip
|
24
25
|
else
|
25
|
-
@element.search('.rank-name').first.text.strip
|
26
|
+
@page.search('.rank-name').first.text.strip
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def title_link
|
31
|
+
if art_page?
|
32
|
+
@page.uri.to_s
|
33
|
+
else
|
34
|
+
File.join(BASE_URL, @page.search('.rank-name').first.search('a').first.attributes.first[1].value)
|
26
35
|
end
|
27
36
|
end
|
28
37
|
|
29
38
|
def image_url
|
30
39
|
if art_page?
|
31
|
-
attrs = @element.search('.productPreview__item img').last.attributes
|
40
|
+
attrs = @page.search('.productPreview__item img').last.attributes
|
32
41
|
|
33
42
|
if attrs['data-src']
|
34
43
|
attrs['data-src'].value
|
@@ -36,20 +45,12 @@ module DMMCrawler
|
|
36
45
|
attrs['src'].value
|
37
46
|
end
|
38
47
|
else
|
39
|
-
@element.search('img').last.attributes['src'].value
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
def title_link
|
44
|
-
if art_page?
|
45
|
-
@element.uri.to_s
|
46
|
-
else
|
47
|
-
File.join(BASE_URL, @element.search('.rank-name').first.search('a').first.attributes.first[1].value)
|
48
|
+
@page.search('img').last.attributes['src'].value
|
48
49
|
end
|
49
50
|
end
|
50
51
|
|
51
52
|
def submedia
|
52
|
-
@element
|
53
|
+
@page
|
53
54
|
.search('.productAttribute-listItem .c_icon_productGenre')
|
54
55
|
.first
|
55
56
|
.attributes['class']
|
@@ -58,9 +59,13 @@ module DMMCrawler
|
|
58
59
|
.delete('-')
|
59
60
|
end
|
60
61
|
|
62
|
+
def author
|
63
|
+
@page.search('p.circleProductTitle__main').text.gsub('作品一覧', '')
|
64
|
+
end
|
65
|
+
|
61
66
|
def informations
|
62
|
-
keys = extract_text(@element.search('.m-productInformation .productInformation__item .informationList__ttl'))
|
63
|
-
values = extract_text(@element.search('.m-productInformation .productInformation__item .informationList__txt'))
|
67
|
+
keys = extract_text(@page.search('.m-productInformation .productInformation__item .informationList__ttl'))
|
68
|
+
values = extract_text(@page.search('.m-productInformation .productInformation__item .informationList__txt'))
|
64
69
|
|
65
70
|
information = keys.zip(values)
|
66
71
|
series = information.find { |array| array.first == 'シリーズ' }
|
@@ -73,22 +78,22 @@ module DMMCrawler
|
|
73
78
|
information.map { |key, value| { key: key, value: value } }
|
74
79
|
end
|
75
80
|
|
76
|
-
def extract_text(elements)
|
77
|
-
elements
|
78
|
-
.select { |element| element.text.strip != 'ジャンル' }
|
79
|
-
.map { |element| element.children.text.strip }
|
80
|
-
end
|
81
|
-
|
82
81
|
def tags
|
83
82
|
if art_page?
|
84
|
-
@element.search('.genreTagList .genreTagList__item a').map { |e| e.text.strip }
|
83
|
+
@page.search('.genreTagList .genreTagList__item a').map { |e| e.text.strip }
|
85
84
|
else
|
86
|
-
@element.search('.rank-labelListItem').map { |e| e.search('a').text.strip }
|
85
|
+
@page.search('.rank-labelListItem').map { |e| e.search('a').text.strip }
|
87
86
|
end
|
88
87
|
end
|
89
88
|
|
89
|
+
def extract_text(elements)
|
90
|
+
elements
|
91
|
+
.select { |element| element.text.strip != 'ジャンル' }
|
92
|
+
.map { |element| element.children.text.strip }
|
93
|
+
end
|
94
|
+
|
90
95
|
def art_page?
|
91
|
-
@element.search('.rank-name').empty?
|
96
|
+
@page.search('.rank-name').empty?
|
92
97
|
end
|
93
98
|
end
|
94
99
|
end
|
data/lib/dmm-crawler/ranking.rb
CHANGED
@@ -14,12 +14,13 @@ module DMMCrawler
|
|
14
14
|
Attributes.new(url).to_a
|
15
15
|
end
|
16
16
|
|
17
|
-
arts.map.with_index(1) do |(title, title_link, image_url, submedia, informations, tags), rank|
|
17
|
+
arts.map.with_index(1) do |(title, title_link, image_url, submedia, author, informations, tags), rank|
|
18
18
|
{
|
19
19
|
title: "#{rank}位: #{title}",
|
20
20
|
title_link: title_link,
|
21
21
|
image_url: image_url,
|
22
22
|
submedia: submedia,
|
23
|
+
author: author,
|
23
24
|
informations: informations,
|
24
25
|
tags: tags
|
25
26
|
}
|
data/lib/dmm-crawler/version.rb
CHANGED
data/spec/dmm-crawler/ranking_spec.rb
CHANGED
@@ -21,7 +21,7 @@ describe DMMCrawler::Ranking do
|
|
21
21
|
|
22
22
|
let(:term) { '24' }
|
23
23
|
|
24
|
-
it { is_expected.to all(include(:title, :title_link, :image_url, :submedia, :informations, :tags)) }
|
24
|
+
it { is_expected.to all(include(:title, :title_link, :image_url, :submedia, :author, :informations, :tags)) }
|
25
25
|
end
|
26
26
|
|
27
27
|
context 'with not registered argument' do
|