dmm-crawler 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/CHANGELOG.md +12 -0
 - data/Gemfile.lock +4 -4
 - data/lib/dmm-crawler/attributes.rb +31 -26
 - data/lib/dmm-crawler/ranking.rb +2 -1
 - data/lib/dmm-crawler/version.rb +1 -1
 - data/spec/dmm-crawler/ranking_spec.rb +1 -1
 - metadata +1 -1
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA1:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 0fbdb558032e848d11224c9030642f04dfee44bf
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: ccb2cc27b9aafe47ca5331ae092889345fd177f9
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 72f5cd5902904c4fe4100fbacf5374bc28070406112d5ca82a5abd909a5eb3c5d7882b77ca905ce60254c9a8b0daf56d74a7df7fc7c7ae77d54c631cad0e50d0
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: 70f8c48093ab24a88c1923312db6c6e2131c2704c04f1d6d20ac603d90b7efa9df1b237313cdfb6be1357064aec89a1c15c4fe831112ae78ea8b4ef8a39be200
         
     | 
    
        data/CHANGELOG.md
    CHANGED
    
    | 
         @@ -1,5 +1,17 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            # Change logs
         
     | 
| 
       2 
2 
     | 
    
         | 
| 
      
 3 
     | 
    
         
            +
            ## 0.2.1
         
     | 
| 
      
 4 
     | 
    
         
            +
            - Add an attribute for author.
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            ## 0.2.0
         
     | 
| 
      
 7 
     | 
    
         
            +
            - Make optional configuration settable for Mechanize.
         
     | 
| 
      
 8 
     | 
    
         
            +
            - Fix a bug where data-src is not found
         
     | 
| 
      
 9 
     | 
    
         
            +
            - Do not fix types of submedia.
         
     | 
| 
      
 10 
     | 
    
         
            +
            - Update `README.md`.
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
      
 12 
     | 
    
         
            +
            ### Breaking Changes
         
     | 
| 
      
 13 
     | 
    
         
            +
            - Do not use passed value of submedia type.
         
     | 
| 
      
 14 
     | 
    
         
            +
             
     | 
| 
       3 
15 
     | 
    
         
             
            ## 0.1.5
         
     | 
| 
       4 
16 
     | 
    
         
             
            - Do not crawl columns related to description.
         
     | 
| 
       5 
17 
     | 
    
         | 
    
        data/Gemfile.lock
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            PATH
         
     | 
| 
       2 
2 
     | 
    
         
             
              remote: .
         
     | 
| 
       3 
3 
     | 
    
         
             
              specs:
         
     | 
| 
       4 
     | 
    
         
            -
                dmm-crawler (0.2.0)
     | 
| 
      
 4 
     | 
    
         
            +
                dmm-crawler (0.2.1)
         
     | 
| 
       5 
5 
     | 
    
         
             
                  mechanize
         
     | 
| 
       6 
6 
     | 
    
         | 
| 
       7 
7 
     | 
    
         
             
            GEM
         
     | 
| 
         @@ -27,11 +27,11 @@ GEM 
     | 
|
| 
       27 
27 
     | 
    
         
             
                mime-types (3.1)
         
     | 
| 
       28 
28 
     | 
    
         
             
                  mime-types-data (~> 3.2015)
         
     | 
| 
       29 
29 
     | 
    
         
             
                mime-types-data (3.2016.0521)
         
     | 
| 
       30 
     | 
    
         
            -
                mini_portile2 (2. 
     | 
| 
      
 30 
     | 
    
         
            +
                mini_portile2 (2.3.0)
         
     | 
| 
       31 
31 
     | 
    
         
             
                net-http-digest_auth (1.4.1)
         
     | 
| 
       32 
32 
     | 
    
         
             
                net-http-persistent (2.9.4)
         
     | 
| 
       33 
     | 
    
         
            -
                nokogiri (1.8. 
     | 
| 
       34 
     | 
    
         
            -
                  mini_portile2 (~> 2. 
     | 
| 
      
 33 
     | 
    
         
            +
                nokogiri (1.8.1)
         
     | 
| 
      
 34 
     | 
    
         
            +
                  mini_portile2 (~> 2.3.0)
         
     | 
| 
       35 
35 
     | 
    
         
             
                ntlm-http (0.1.1)
         
     | 
| 
       36 
36 
     | 
    
         
             
                parser (2.4.0.0)
         
     | 
| 
       37 
37 
     | 
    
         
             
                  ast (~> 2.2)
         
     | 
    
        data/lib/dmm-crawler/attributes.rb
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module DMMCrawler
         
     | 
| 
       2 
2 
     | 
    
         
             
              class Attributes
         
     | 
| 
       3 
3 
     | 
    
         
             
                def initialize(url)
         
     | 
| 
       4 
     | 
    
         
            -
                  @element = Agent.instance.agent.get(url)
     | 
| 
      
 4 
     | 
    
         
            +
                  @page = Agent.instance.agent.get(url)
         
     | 
| 
       5 
5 
     | 
    
         
             
                end
         
     | 
| 
       6 
6 
     | 
    
         | 
| 
       7 
7 
     | 
    
         
             
                def to_a
         
     | 
| 
         @@ -10,6 +10,7 @@ module DMMCrawler 
     | 
|
| 
       10 
10 
     | 
    
         
             
                    title_link,
         
     | 
| 
       11 
11 
     | 
    
         
             
                    image_url,
         
     | 
| 
       12 
12 
     | 
    
         
             
                    submedia,
         
     | 
| 
      
 13 
     | 
    
         
            +
                    author,
         
     | 
| 
       13 
14 
     | 
    
         
             
                    informations,
         
     | 
| 
       14 
15 
     | 
    
         
             
                    tags
         
     | 
| 
       15 
16 
     | 
    
         
             
                  ]
         
     | 
| 
         @@ -19,16 +20,24 @@ module DMMCrawler 
     | 
|
| 
       19 
20 
     | 
    
         | 
| 
       20 
21 
     | 
    
         
             
                def title
         
     | 
| 
       21 
22 
     | 
    
         
             
                  if art_page?
         
     | 
| 
       22 
     | 
    
         
            -
                    @element.search('.productTitle__txt span').remove
     | 
| 
       23 
     | 
    
         
            -
                    @element.search('.productTitle__txt').text.strip
     | 
| 
      
 23 
     | 
    
         
            +
                    @page.search('.productTitle__txt span').remove
         
     | 
| 
      
 24 
     | 
    
         
            +
                    @page.search('.productTitle__txt').text.strip
         
     | 
| 
       24 
25 
     | 
    
         
             
                  else
         
     | 
| 
       25 
     | 
    
         
            -
                    @element.search('.rank-name').first.text.strip
     | 
| 
      
 26 
     | 
    
         
            +
                    @page.search('.rank-name').first.text.strip
         
     | 
| 
      
 27 
     | 
    
         
            +
                  end
         
     | 
| 
      
 28 
     | 
    
         
            +
                end
         
     | 
| 
      
 29 
     | 
    
         
            +
             
     | 
| 
      
 30 
     | 
    
         
            +
                def title_link
         
     | 
| 
      
 31 
     | 
    
         
            +
                  if art_page?
         
     | 
| 
      
 32 
     | 
    
         
            +
                    @page.uri.to_s
         
     | 
| 
      
 33 
     | 
    
         
            +
                  else
         
     | 
| 
      
 34 
     | 
    
         
            +
                    File.join(BASE_URL, @page.search('.rank-name').first.search('a').first.attributes.first[1].value)
         
     | 
| 
       26 
35 
     | 
    
         
             
                  end
         
     | 
| 
       27 
36 
     | 
    
         
             
                end
         
     | 
| 
       28 
37 
     | 
    
         | 
| 
       29 
38 
     | 
    
         
             
                def image_url
         
     | 
| 
       30 
39 
     | 
    
         
             
                  if art_page?
         
     | 
| 
       31 
     | 
    
         
            -
                    attrs = @element.search('.productPreview__item img').last.attributes
     | 
| 
      
 40 
     | 
    
         
            +
                    attrs = @page.search('.productPreview__item img').last.attributes
         
     | 
| 
       32 
41 
     | 
    
         | 
| 
       33 
42 
     | 
    
         
             
                    if attrs['data-src']
         
     | 
| 
       34 
43 
     | 
    
         
             
                      attrs['data-src'].value
         
     | 
| 
         @@ -36,20 +45,12 @@ module DMMCrawler 
     | 
|
| 
       36 
45 
     | 
    
         
             
                      attrs['src'].value
         
     | 
| 
       37 
46 
     | 
    
         
             
                    end
         
     | 
| 
       38 
47 
     | 
    
         
             
                  else
         
     | 
| 
       39 
     | 
    
         
            -
                    @element.search('img').last.attributes['src'].value
     | 
| 
       40 
     | 
    
         
            -
                  end
         
     | 
| 
       41 
     | 
    
         
            -
                end
         
     | 
| 
       42 
     | 
    
         
            -
             
     | 
| 
       43 
     | 
    
         
            -
                def title_link
         
     | 
| 
       44 
     | 
    
         
            -
                  if art_page?
         
     | 
| 
       45 
     | 
    
         
            -
                    @element.uri.to_s
         
     | 
| 
       46 
     | 
    
         
            -
                  else
         
     | 
| 
       47 
     | 
    
         
            -
                    File.join(BASE_URL, @element.search('.rank-name').first.search('a').first.attributes.first[1].value)
         
     | 
| 
      
 48 
     | 
    
         
            +
                    @page.search('img').last.attributes['src'].value
         
     | 
| 
       48 
49 
     | 
    
         
             
                  end
         
     | 
| 
       49 
50 
     | 
    
         
             
                end
         
     | 
| 
       50 
51 
     | 
    
         | 
| 
       51 
52 
     | 
    
         
             
                def submedia
         
     | 
| 
       52 
     | 
    
         
            -
                  @element
     | 
| 
      
 53 
     | 
    
         
            +
                  @page
         
     | 
| 
       53 
54 
     | 
    
         
             
                    .search('.productAttribute-listItem .c_icon_productGenre')
         
     | 
| 
       54 
55 
     | 
    
         
             
                    .first
         
     | 
| 
       55 
56 
     | 
    
         
             
                    .attributes['class']
         
     | 
| 
         @@ -58,9 +59,13 @@ module DMMCrawler 
     | 
|
| 
       58 
59 
     | 
    
         
             
                    .delete('-')
         
     | 
| 
       59 
60 
     | 
    
         
             
                end
         
     | 
| 
       60 
61 
     | 
    
         | 
| 
      
 62 
     | 
    
         
            +
                def author
         
     | 
| 
      
 63 
     | 
    
         
            +
                  @page.search('p.circleProductTitle__main').text.gsub('作品一覧', '')
         
     | 
| 
      
 64 
     | 
    
         
            +
                end
         
     | 
| 
      
 65 
     | 
    
         
            +
             
     | 
| 
       61 
66 
     | 
    
         
             
                def informations
         
     | 
| 
       62 
     | 
    
         
            -
                  keys = extract_text(@element.search('.m-productInformation .productInformation__item .informationList__ttl'))
     | 
| 
       63 
     | 
    
         
            -
                  values = extract_text(@element.search('.m-productInformation .productInformation__item .informationList__txt'))
     | 
| 
      
 67 
     | 
    
         
            +
                  keys = extract_text(@page.search('.m-productInformation .productInformation__item .informationList__ttl'))
         
     | 
| 
      
 68 
     | 
    
         
            +
                  values = extract_text(@page.search('.m-productInformation .productInformation__item .informationList__txt'))
         
     | 
| 
       64 
69 
     | 
    
         | 
| 
       65 
70 
     | 
    
         
             
                  information = keys.zip(values)
         
     | 
| 
       66 
71 
     | 
    
         
             
                  series = information.find { |array| array.first == 'シリーズ' }
         
     | 
| 
         @@ -73,22 +78,22 @@ module DMMCrawler 
     | 
|
| 
       73 
78 
     | 
    
         
             
                  information.map { |key, value| { key: key, value: value } }
         
     | 
| 
       74 
79 
     | 
    
         
             
                end
         
     | 
| 
       75 
80 
     | 
    
         | 
| 
       76 
     | 
    
         
            -
                def extract_text(elements)
         
     | 
| 
       77 
     | 
    
         
            -
                  elements
         
     | 
| 
       78 
     | 
    
         
            -
                    .select { |element| element.text.strip != 'ジャンル' }
         
     | 
| 
       79 
     | 
    
         
            -
                    .map { |element| element.children.text.strip }
         
     | 
| 
       80 
     | 
    
         
            -
                end
         
     | 
| 
       81 
     | 
    
         
            -
             
     | 
| 
       82 
81 
     | 
    
         
             
                def tags
         
     | 
| 
       83 
82 
     | 
    
         
             
                  if art_page?
         
     | 
| 
       84 
     | 
    
         
            -
                    @element.search('.genreTagList .genreTagList__item a').map { |e| e.text.strip }
     | 
| 
      
 83 
     | 
    
         
            +
                    @page.search('.genreTagList .genreTagList__item a').map { |e| e.text.strip }
         
     | 
| 
       85 
84 
     | 
    
         
             
                  else
         
     | 
| 
       86 
     | 
    
         
            -
                    @element.search('.rank-labelListItem').map { |e| e.search('a').text.strip }
     | 
| 
      
 85 
     | 
    
         
            +
                    @page.search('.rank-labelListItem').map { |e| e.search('a').text.strip }
         
     | 
| 
       87 
86 
     | 
    
         
             
                  end
         
     | 
| 
       88 
87 
     | 
    
         
             
                end
         
     | 
| 
       89 
88 
     | 
    
         | 
| 
      
 89 
     | 
    
         
            +
                def extract_text(elements)
         
     | 
| 
      
 90 
     | 
    
         
            +
                  elements
         
     | 
| 
      
 91 
     | 
    
         
            +
                    .select { |element| element.text.strip != 'ジャンル' }
         
     | 
| 
      
 92 
     | 
    
         
            +
                    .map { |element| element.children.text.strip }
         
     | 
| 
      
 93 
     | 
    
         
            +
                end
         
     | 
| 
      
 94 
     | 
    
         
            +
             
     | 
| 
       90 
95 
     | 
    
         
             
                def art_page?
         
     | 
| 
       91 
     | 
    
         
            -
                  @element.search('.rank-name').empty?
     | 
| 
      
 96 
     | 
    
         
            +
                  @page.search('.rank-name').empty?
         
     | 
| 
       92 
97 
     | 
    
         
             
                end
         
     | 
| 
       93 
98 
     | 
    
         
             
              end
         
     | 
| 
       94 
99 
     | 
    
         
             
            end
         
     | 
    
        data/lib/dmm-crawler/ranking.rb
    CHANGED
    
    | 
         @@ -14,12 +14,13 @@ module DMMCrawler 
     | 
|
| 
       14 
14 
     | 
    
         
             
                    Attributes.new(url).to_a
         
     | 
| 
       15 
15 
     | 
    
         
             
                  end
         
     | 
| 
       16 
16 
     | 
    
         | 
| 
       17 
     | 
    
         
            -
                  arts.map.with_index(1) do |(title, title_link, image_url, submedia, informations, tags), rank|
         
     | 
| 
      
 17 
     | 
    
         
            +
                  arts.map.with_index(1) do |(title, title_link, image_url, submedia, author, informations, tags), rank|
         
     | 
| 
       18 
18 
     | 
    
         
             
                    {
         
     | 
| 
       19 
19 
     | 
    
         
             
                      title: "#{rank}位: #{title}",
         
     | 
| 
       20 
20 
     | 
    
         
             
                      title_link: title_link,
         
     | 
| 
       21 
21 
     | 
    
         
             
                      image_url: image_url,
         
     | 
| 
       22 
22 
     | 
    
         
             
                      submedia: submedia,
         
     | 
| 
      
 23 
     | 
    
         
            +
                      author: author,
         
     | 
| 
       23 
24 
     | 
    
         
             
                      informations: informations,
         
     | 
| 
       24 
25 
     | 
    
         
             
                      tags: tags
         
     | 
| 
       25 
26 
     | 
    
         
             
                    }
         
     | 
    
        data/lib/dmm-crawler/version.rb
    CHANGED
    
    
        data/spec/dmm-crawler/ranking_spec.rb
    CHANGED
    
    | 
         @@ -21,7 +21,7 @@ describe DMMCrawler::Ranking do 
     | 
|
| 
       21 
21 
     | 
    
         | 
| 
       22 
22 
     | 
    
         
             
                  let(:term) { '24' }
         
     | 
| 
       23 
23 
     | 
    
         | 
| 
       24 
     | 
    
         
            -
                  it { is_expected.to all(include(:title, :title_link, :image_url, :submedia, :informations, :tags)) }
         
     | 
| 
      
 24 
     | 
    
         
            +
                  it { is_expected.to all(include(:title, :title_link, :image_url, :submedia, :author, :informations, :tags)) }
         
     | 
| 
       25 
25 
     | 
    
         
             
                end
         
     | 
| 
       26 
26 
     | 
    
         | 
| 
       27 
27 
     | 
    
         
             
                context 'with not registered argument' do
         
     |