dmm-crawler 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/Gemfile.lock +4 -4
- data/lib/dmm-crawler/attributes.rb +31 -26
- data/lib/dmm-crawler/ranking.rb +2 -1
- data/lib/dmm-crawler/version.rb +1 -1
- data/spec/dmm-crawler/ranking_spec.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0fbdb558032e848d11224c9030642f04dfee44bf
|
4
|
+
data.tar.gz: ccb2cc27b9aafe47ca5331ae092889345fd177f9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 72f5cd5902904c4fe4100fbacf5374bc28070406112d5ca82a5abd909a5eb3c5d7882b77ca905ce60254c9a8b0daf56d74a7df7fc7c7ae77d54c631cad0e50d0
|
7
|
+
data.tar.gz: 70f8c48093ab24a88c1923312db6c6e2131c2704c04f1d6d20ac603d90b7efa9df1b237313cdfb6be1357064aec89a1c15c4fe831112ae78ea8b4ef8a39be200
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,17 @@
|
|
1
1
|
# Change logs
|
2
2
|
|
3
|
+
## 0.2.1
|
4
|
+
- Add an attribute for author.
|
5
|
+
|
6
|
+
## 0.2.0
|
7
|
+
- Make optional configuration settable for Mechanize.
|
8
|
+
- Fix a bug where data-src is not found
|
9
|
+
- Do not fix types of submedia.
|
10
|
+
- Update `README.md`.
|
11
|
+
|
12
|
+
### Breaking Changes
|
13
|
+
- Do not use passed value of submedia type.
|
14
|
+
|
3
15
|
## 0.1.5
|
4
16
|
- Do not crawl columns related to description.
|
5
17
|
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
dmm-crawler (0.2.0)
|
4
|
+
dmm-crawler (0.2.1)
|
5
5
|
mechanize
|
6
6
|
|
7
7
|
GEM
|
@@ -27,11 +27,11 @@ GEM
|
|
27
27
|
mime-types (3.1)
|
28
28
|
mime-types-data (~> 3.2015)
|
29
29
|
mime-types-data (3.2016.0521)
|
30
|
-
mini_portile2 (2.
|
30
|
+
mini_portile2 (2.3.0)
|
31
31
|
net-http-digest_auth (1.4.1)
|
32
32
|
net-http-persistent (2.9.4)
|
33
|
-
nokogiri (1.8.
|
34
|
-
mini_portile2 (~> 2.
|
33
|
+
nokogiri (1.8.1)
|
34
|
+
mini_portile2 (~> 2.3.0)
|
35
35
|
ntlm-http (0.1.1)
|
36
36
|
parser (2.4.0.0)
|
37
37
|
ast (~> 2.2)
|
data/lib/dmm-crawler/attributes.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module DMMCrawler
|
2
2
|
class Attributes
|
3
3
|
def initialize(url)
|
4
|
-
@
|
4
|
+
@page = Agent.instance.agent.get(url)
|
5
5
|
end
|
6
6
|
|
7
7
|
def to_a
|
@@ -10,6 +10,7 @@ module DMMCrawler
|
|
10
10
|
title_link,
|
11
11
|
image_url,
|
12
12
|
submedia,
|
13
|
+
author,
|
13
14
|
informations,
|
14
15
|
tags
|
15
16
|
]
|
@@ -19,16 +20,24 @@ module DMMCrawler
|
|
19
20
|
|
20
21
|
def title
|
21
22
|
if art_page?
|
22
|
-
@
|
23
|
-
@
|
23
|
+
@page.search('.productTitle__txt span').remove
|
24
|
+
@page.search('.productTitle__txt').text.strip
|
24
25
|
else
|
25
|
-
@
|
26
|
+
@page.search('.rank-name').first.text.strip
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def title_link
|
31
|
+
if art_page?
|
32
|
+
@page.uri.to_s
|
33
|
+
else
|
34
|
+
File.join(BASE_URL, @page.search('.rank-name').first.search('a').first.attributes.first[1].value)
|
26
35
|
end
|
27
36
|
end
|
28
37
|
|
29
38
|
def image_url
|
30
39
|
if art_page?
|
31
|
-
attrs = @
|
40
|
+
attrs = @page.search('.productPreview__item img').last.attributes
|
32
41
|
|
33
42
|
if attrs['data-src']
|
34
43
|
attrs['data-src'].value
|
@@ -36,20 +45,12 @@ module DMMCrawler
|
|
36
45
|
attrs['src'].value
|
37
46
|
end
|
38
47
|
else
|
39
|
-
@
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
def title_link
|
44
|
-
if art_page?
|
45
|
-
@element.uri.to_s
|
46
|
-
else
|
47
|
-
File.join(BASE_URL, @element.search('.rank-name').first.search('a').first.attributes.first[1].value)
|
48
|
+
@page.search('img').last.attributes['src'].value
|
48
49
|
end
|
49
50
|
end
|
50
51
|
|
51
52
|
def submedia
|
52
|
-
@
|
53
|
+
@page
|
53
54
|
.search('.productAttribute-listItem .c_icon_productGenre')
|
54
55
|
.first
|
55
56
|
.attributes['class']
|
@@ -58,9 +59,13 @@ module DMMCrawler
|
|
58
59
|
.delete('-')
|
59
60
|
end
|
60
61
|
|
62
|
+
def author
|
63
|
+
@page.search('p.circleProductTitle__main').text.gsub('作品一覧', '')
|
64
|
+
end
|
65
|
+
|
61
66
|
def informations
|
62
|
-
keys = extract_text(@
|
63
|
-
values = extract_text(@
|
67
|
+
keys = extract_text(@page.search('.m-productInformation .productInformation__item .informationList__ttl'))
|
68
|
+
values = extract_text(@page.search('.m-productInformation .productInformation__item .informationList__txt'))
|
64
69
|
|
65
70
|
information = keys.zip(values)
|
66
71
|
series = information.find { |array| array.first == 'シリーズ' }
|
@@ -73,22 +78,22 @@ module DMMCrawler
|
|
73
78
|
information.map { |key, value| { key: key, value: value } }
|
74
79
|
end
|
75
80
|
|
76
|
-
def extract_text(elements)
|
77
|
-
elements
|
78
|
-
.select { |element| element.text.strip != 'ジャンル' }
|
79
|
-
.map { |element| element.children.text.strip }
|
80
|
-
end
|
81
|
-
|
82
81
|
def tags
|
83
82
|
if art_page?
|
84
|
-
@
|
83
|
+
@page.search('.genreTagList .genreTagList__item a').map { |e| e.text.strip }
|
85
84
|
else
|
86
|
-
@
|
85
|
+
@page.search('.rank-labelListItem').map { |e| e.search('a').text.strip }
|
87
86
|
end
|
88
87
|
end
|
89
88
|
|
89
|
+
def extract_text(elements)
|
90
|
+
elements
|
91
|
+
.select { |element| element.text.strip != 'ジャンル' }
|
92
|
+
.map { |element| element.children.text.strip }
|
93
|
+
end
|
94
|
+
|
90
95
|
def art_page?
|
91
|
-
@
|
96
|
+
@page.search('.rank-name').empty?
|
92
97
|
end
|
93
98
|
end
|
94
99
|
end
|
data/lib/dmm-crawler/ranking.rb
CHANGED
@@ -14,12 +14,13 @@ module DMMCrawler
|
|
14
14
|
Attributes.new(url).to_a
|
15
15
|
end
|
16
16
|
|
17
|
-
arts.map.with_index(1) do |(title, title_link, image_url, submedia, informations, tags), rank|
|
17
|
+
arts.map.with_index(1) do |(title, title_link, image_url, submedia, author, informations, tags), rank|
|
18
18
|
{
|
19
19
|
title: "#{rank}位: #{title}",
|
20
20
|
title_link: title_link,
|
21
21
|
image_url: image_url,
|
22
22
|
submedia: submedia,
|
23
|
+
author: author,
|
23
24
|
informations: informations,
|
24
25
|
tags: tags
|
25
26
|
}
|
data/lib/dmm-crawler/version.rb
CHANGED
data/spec/dmm-crawler/ranking_spec.rb
CHANGED
@@ -21,7 +21,7 @@ describe DMMCrawler::Ranking do
|
|
21
21
|
|
22
22
|
let(:term) { '24' }
|
23
23
|
|
24
|
-
it { is_expected.to all(include(:title, :title_link, :image_url, :submedia, :informations, :tags)) }
|
24
|
+
it { is_expected.to all(include(:title, :title_link, :image_url, :submedia, :author, :informations, :tags)) }
|
25
25
|
end
|
26
26
|
|
27
27
|
context 'with not registered argument' do
|