mediaarts_scraper 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c95b848f065f1110dcf0616253e774f9b4db26be
4
+ data.tar.gz: 86994c5d54fc3c8336ecc0cd2496b28267e2dcf3
5
+ SHA512:
6
+ metadata.gz: 952ce1b825b4e18fc8b6c4ab22b7a3135edcfcbb7e2b4333c4140dfbb3183e0c86db2804554f2927aa5ca8405ad4be1eb77c384f5957836058f83aa091385690
7
+ data.tar.gz: ddef66e45a80cc515c7a8564cdd0a552b34cbf0285d622d2984c20f222f2dbe4bd0c9e50b93c36baeb29ec46a5de2a54bc5b7e5c8020d959d39426fe422898d8
data/.gitignore ADDED
@@ -0,0 +1,10 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+ /tools/sit/data/actual.json
10
+ /Gemfile.lock
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
# Gems are resolved from the public RubyGems index.
source "https://rubygems.org"

# Expands the `github: "owner/repo"` shorthand into an HTTPS clone URL.
git_source(:github) do |repo_name|
  "https://github.com/#{repo_name}"
end

# Pull the dependency list from the gemspec in this directory.
gemspec
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2018 xmisao
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,92 @@
1
+ # MediaartsScraper
2
+
3
+ Ruby scraper implementation for https://mediaarts-db.bunka.go.jp/
4
+
5
+ 文化庁のメディア芸術データベースをスクレイピングするGemライブラリです。
6
+
7
+ # Installation
8
+
9
+ ```bash
10
+ gem install 'mediaarts_scraper'
11
+ ```
12
+
13
+ # Usage
14
+
15
+ アニメシリーズIDを指定してスクレイピングする。
16
+
17
+ ```ruby
18
+ require "mediaarts_scraper"
19
+
20
+ # アニメシリーズID
21
+ ans_id = 14810 # 魔法少女まどか★マギカ
22
+
23
+ # ページオブジェクトを作る
24
+ anime_series_page = MediaartsScraper::Page::AnimeSeriesPage.from_ans_id(ans_id)
25
+
26
+ # データオブジェクトを得る
27
+ anime_series = anime_series_page.data
28
+
29
+ # データオブジェクトから取得できる属性を得る
30
+ # 属性の名前はサイト上で言語をEnglishにした場合の項目名に準じます
31
+ p anime_series.attributes #=> [:anime_series_id, :distribution_format, :anime_work_id, :title, :title_kana, :date_of_release, :date_of_end, :time_slot_or_duration, :number_of_broadc
32
+ asts, :number_of_episodes, :distributor, :broadcast_period, :production_company, :original_source, :director, :credits, :cast, :theme_songs, :episode_t
33
+ itles, :notes, :description, :story, :main_characters, :character_design, :characters, :music_or_sound_effects, :format_of_original_source, :eirin_no,
34
+ :rating_by_the_eirin, :copyright, :english_title_in_japan, :english_title_overseas, :hepburn_romanization_of_original_title, :romanization_of_original_
35
+ title, :alternative_titles, :related_works, :tag, :carrier_type_of_original_material, :information_sources, :episodes, :materials, :packages, :related_
36
+ series, :manga_work]
37
+
38
+ # データオブジェクトから属性の値を得る
39
+ p anime_series.title #=> "魔法少女まどか★マギカ"
40
+
41
+ # データオブジェクトをハッシュに変換する
42
+ p anime_series.to_hash #=> {"class"=>"MediaartsScraper::Data::AnimeSeries", :anime_series_id=>"ANS001019800", :distribution_format=>"TV", :anime_work_id=>"ANT001019800", :title=>
43
+ "魔法少女まどか★マギカ",...
44
+
45
+ # データオブジェクトをJSONに変換する
46
+ p anime_series.to_json #=> "{\"class\":\"MediaartsScraper::Data::AnimeSeries\",\"anime_series_id\":\"ANS001019800\",\"distribution_format\":\"TV\",\"anime_work_id\":\"ANT001019800\",\"title\":\"魔法少女まどか★マギカ\",...
47
+ ```
48
+
49
+ `examples`以下に使用例があります。
50
+
51
+ # Supported Pages
52
+
53
+ バージョン0.1はアニメーションのみの対応です。
54
+
55
+ |Path|Page title|Page Object|Data Object|
56
+ |:---|:---|:---|:---|
57
+ |`/mg/`|マンガ|-|-|
58
+ |`/an/anime_series/<ans_id>`|作品情報(シリーズ)|`AnimeSeriesPage`|`AnimeSeries`|
59
+ |`/an/anime_series/<ans_id>/anime_episodes`|各話情報一覧|`AnimeEpisodesPage`|`AnimeEpisodes`|
60
+ |`/an/anime_series/<ans_id>/anime_packages/<anp_id>`|パッケージ情報|`AnimePackagesPage`|`AnimePackages`|
61
+ |`/gm/`|ゲーム|-|-|
62
+ |`/ma/`|メディアアート|-|-|
63
+
64
+ `-` means not supported yet.
65
+
66
+ # Development
67
+
68
+ ## Debug
69
+
70
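+ Set the following environment variable to make mediaarts_scraper output its internal logs. The value is converted to an integer and used as the logger level, so `0` enables DEBUG output.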
71
+
72
+ ```
73
+ MEDIAARTS_SCRAPER_LOG_LEVEL=0
74
+ ```
75
+
76
+ ## Testing
77
+
78
+ Run the system integration tests. (A network connection is required.)
79
+
80
+ ```
81
+ rake sit
82
+ ```
83
+
84
+ Update the expected results used by the system integration tests.
85
+
86
+ ```
87
+ rake sit_update
88
+ ```
89
+
90
+ ## Contributing
91
+
92
+ Bug reports and pull requests are welcome on GitHub at https://github.com/xmisao/mediaarts_scraper
data/Rakefile ADDED
@@ -0,0 +1,29 @@
1
require "bundler/gem_tasks"

# NOTE(review): this Rakefile defines no :spec task itself, so the default
# task only works if something else provides one -- confirm.
task :default => :spec

desc 'Run formatting by rufo'
task :rufo do
  # The exit status is deliberately ignored, exactly as before.
  # NOTE(review): rufo presumably exits non-zero when it reformats a file, so
  # failing the task on that status would be wrong -- confirm.
  system("rufo Gemfile lib bin examples tools")
end

desc 'Run system integration testing'
task :sit do
  # `sh` fails the rake task when the script exits non-zero; the previous
  # fork/exec/Process.wait sequence never looked at the exit status and
  # depended on Kernel#fork, which is not available on every platform.
  sh "ruby tools/sit/sit.rb"
end

desc 'Update expectation of system integration testing'
task :sit_update do
  sh "ruby tools/sit/sit_update.rb"
end
@@ -0,0 +1,16 @@
1
module MediaartsScraper
  module Data
    # A single row of an episode list, built by AnimeEpisodes from one entry
    # of its "story_table" raw data.
    #
    # Each `attribute` call maps a reader method to the raw-data key given as
    # the second argument. A third argument of `true` marks the attribute as
    # copyrighted, which keeps it out of the `*_without_copyrighted` output.
    class AnimeEpisode
      include DataObject

      attribute(:episode_no, "Episode No.")
      attribute(:episode_title, "Episode Title")
      attribute(:date_of_release, "Date of Release")
      attribute(:credits, "Credits")
      attribute(:cast, "Cast")
      attribute(:character, "Character")
      attribute(:story, "Story", true)
      attribute(:main_mecha, "Main Mecha")
      attribute(:notes, "Notes")
      attribute(:information_sources, "Information Sources")
    end
  end
end
@@ -0,0 +1,8 @@
1
module MediaartsScraper
  module Data
    # Data object for one page of an episode list: the title plus the rows of
    # the episode table, each wrapped in an AnimeEpisode.
    class AnimeEpisodes
      include DataObject

      attribute(:title, "title")

      # Every raw row of "story_table" becomes an AnimeEpisode.
      attribute(:episodes, "story_table") do |rows|
        rows.map { |row| AnimeEpisode.new(row) }
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
module MediaartsScraper
  module Data
    # Data object for a package page. Readers map to the English labels used
    # as raw-data keys; "Physical Collection" is wrapped in its own object.
    class AnimePackages
      include DataObject

      attribute(:package_id, "Package ID")
      attribute(:title, "Title")
      attribute(:title_kana, "Title-kana")
      attribute(:other_editions, "Other Editions")
      attribute(:volumes, "Volumes")
      attribute(:content_description, "Content Description")
      attribute(:number_of_discs, "Number of Discs / Duration")
      attribute(:series_title, "Series Title")
      attribute(:series_no, "Series No.")
      attribute(:publisher_etc, "Publisher etc.")
      attribute(:credits, "Credits")
      attribute(:publication_format, "Publication Format")
      attribute(:notes, "Notes")
      attribute(:size, "Size")
      attribute(:appendices, "Appendices")
      attribute(:content_specifications, "Content Specifications")
      attribute(:sales_no, "Sales No. (Model No./Stock No.)")
      attribute(:jan, "JAN (EAN/UPC)")
      attribute(:price, "Price")
      attribute(:date_of_publication, "Date of Publication")
      attribute(:language, "Language")
      attribute(:ratings, "Ratings")
      attribute(:place_of_publication, "Place of Publication")
      attribute(:national_bib_no_jpno, "National Bib. No. (JPNO)")
      attribute(:package_description, "Package Description")
      attribute(:tag, "Tag")

      # The nested key/value data is exposed as a data object of its own.
      attribute(:physical_collection, "Physical Collection") do |raw_collection|
        AnimePackagesPhysicalCollection.new(raw_collection)
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module MediaartsScraper
  module Data
    # Holding information attached to a package; built by AnimePackages from
    # its "Physical Collection" raw data.
    class AnimePackagesPhysicalCollection
      include DataObject

      attribute :package_ownership_information_id, "Package Ownership Information ID"
      attribute :name_of_institution, "Name of Institution"
      attribute :registration_no, "Registration No.(Institutional Collection Item ID)"
      # The misspelled reader name is kept because it is part of the public
      # API and is the key emitted by #to_hash / #to_json.
      attribute :notes_from_istitution, "Notes from Institution"

      # Correctly spelled alias. It is not registered as an attribute, so the
      # serialized output is unchanged.
      alias_method :notes_from_institution, :notes_from_istitution
    end
  end
end
@@ -0,0 +1,51 @@
1
module MediaartsScraper
  module Data
    # Data object for an anime series page.
    #
    # Plain attributes return the raw value stored under the English label.
    # A third argument of `true` marks an attribute as copyrighted, which
    # keeps it out of the `*_without_copyrighted` output. The collection
    # attributes at the bottom wrap each raw row in a dedicated data object.
    class AnimeSeries
      include DataObject

      attribute :anime_series_id, "Anime Series ID"
      attribute :distribution_format, "Distribution Format"
      attribute :anime_work_id, "Anime Work ID"
      attribute :title, "Title"
      attribute :title_kana, "Title-kana"
      attribute :date_of_release, "Date of Release"
      attribute :date_of_end, "Date of End"
      attribute :time_slot_or_duration, "Time Slot / Duration"
      attribute :number_of_broadcasts, "Number of Broadcasts"
      attribute :number_of_episodes, "Number of Episodes"
      attribute :distributor, "Distributor"
      attribute :broadcast_period, "Broadcast Period"
      attribute :production_company, "Production Company"
      attribute :original_source, "Original Source"
      attribute :director, "Director"
      attribute :credits, "Credits"
      attribute :cast, "Cast"
      attribute :theme_songs, "Theme Songs"
      attribute :episode_titles, "Episode Titles"
      attribute :notes, "Notes"
      attribute :description, "Description", true
      attribute :story, "Story", true
      attribute :main_characters, "Main Characters", true
      attribute :character_design, "Character Design"
      attribute :characters, "Characters (Mecha)", true
      attribute :music_or_sound_effects, "Music / Sound Effects"
      attribute :format_of_original_source, "Format of Original Source"
      attribute :eirin_no, "EIRIN No."
      attribute :rating_by_the_eirin, "Rating (by the EIRIN)"
      attribute :copyright, "Copyright"
      attribute :english_title_in_japan, "English Title (In Japan)"
      attribute :english_title_overseas, "English Title (Overseas)"
      attribute :hepburn_romanization_of_original_title, "Hepburn Romanization of Original Title"
      attribute :romanization_of_original_title, "Romanization of Original Title (w/Macron diacritics)"
      attribute :alternative_titles, "Alternative Titles"
      attribute :related_works, "Related Works"
      attribute :tag, "Tag"
      attribute :carrier_type_of_original_material, "Carrier Type of Original Material"
      attribute :information_sources, "Information Sources"

      # The page parser only adds a key for sections it finds, so the raw
      # value is nil when a section is missing. `|| []` turns that into an
      # empty list instead of a NoMethodError on nil.
      attribute(:episodes, "Episodes") { |episodes| (episodes || []).map { |episode| AnimeSeriesEpisode.new(episode) } }
      attribute(:materials, "Materials") { |materials| (materials || []).map { |material| AnimeSeriesMaterial.new(material) } }
      attribute(:packages, "Packages") { |packages| (packages || []).map { |package| AnimeSeriesPackage.new(package) } }
      attribute(:related_series, "Related Series") { |related_serieses| (related_serieses || []).map { |related_series| AnimeSeriesRelatedSeries.new(related_series) } }
      attribute(:manga_work, "Manga Work") { |manga_works| (manga_works || []).map { |manga_work| AnimeSeriesMangaWork.new(manga_work) } }
    end
  end
end
@@ -0,0 +1,10 @@
1
module MediaartsScraper
  module Data
    # One row of the "Episodes" table on a series page; built by AnimeSeries.
    class AnimeSeriesEpisode
      include DataObject

      attribute(:episode_no, "Episode No.")
      attribute(:episode_title, "Episode Title")
      attribute(:date_of_release, "Date of Release")
      attribute(:notes, "Notes")
    end
  end
end
@@ -0,0 +1,14 @@
1
module MediaartsScraper
  module Data
    # One row of the "Manga Work" table on a series page; built by AnimeSeries.
    class AnimeSeriesMangaWork
      include DataObject

      attribute(:title, "Title")
      attribute(:author, "Author")

      # The converter block runs with the class as `self`, so it can call the
      # class-level helper defined below.
      attribute(:mmt_id, "href") do |href|
        self.to_mmt_id(href)
      end

      # Extracts the numeric id following "comic_works/" from a link.
      #
      # @param href [String, nil] link taken from the row
      # @return [Integer, nil] the id, or nil when href is missing or has no match
      def self.to_mmt_id(href)
        return unless href

        matched = /comic_works\/(\d+)/.match(href)
        matched && matched[1].to_i
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module MediaartsScraper
  module Data
    # One row of the "Materials" table on a series page; built by AnimeSeries.
    class AnimeSeriesMaterial
      include DataObject

      attribute :material_name, "Material Name"
      attribute :category, "Category"
      # The misspelled reader name is kept because it is part of the public
      # API and is the key emitted by #to_hash / #to_json.
      attribute :list_of_authers, "List of Authors"
      attribute :access_to_this_material, "Access to this Material"

      # Correctly spelled alias. It is not registered as an attribute, so the
      # serialized output is unchanged.
      alias_method :list_of_authors, :list_of_authers
    end
  end
end
@@ -0,0 +1,23 @@
1
module MediaartsScraper
  module Data
    # One row of the "Packages" table on a series page; built by AnimeSeries.
    # Both ids are derived from the row's link.
    class AnimeSeriesPackage
      include DataObject

      attribute(:title, "Title")
      attribute(:volumes, "Volumes")
      attribute(:other_editions, "Other Editions")
      attribute(:publication_format, "Publication Format")
      attribute(:date_of_publication, "Date of Publication")
      attribute(:access_to_this_material, "Access to this Material")

      # The converter blocks run with the class as `self`, so they can call
      # the class-level helpers defined below.
      attribute(:ans_id, "href") do |href|
        self.to_ans_id(href)
      end

      attribute(:anp_id, "href") do |href|
        self.to_anp_id(href)
      end

      # Extracts the numeric id following "anime_series/" from a link.
      #
      # @param href [String, nil]
      # @return [Integer, nil] nil when href is missing or has no match
      def self.to_ans_id(href)
        return unless href

        matched = /anime_series\/(\d+)/.match(href)
        matched && matched[1].to_i
      end

      # Extracts the numeric id following "anime_packages/" from a link.
      #
      # @param href [String, nil]
      # @return [Integer, nil] nil when href is missing or has no match
      def self.to_anp_id(href)
        return unless href

        matched = /anime_packages\/(\d+)/.match(href)
        matched && matched[1].to_i
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
module MediaartsScraper
  module Data
    # One row of the "Related Series" table on a series page; built by
    # AnimeSeries.
    class AnimeSeriesRelatedSeries
      include DataObject

      attribute(:title, "Title")
      attribute(:distribution_format, "Distribution Format")
      attribute(:director, "Director")
      attribute(:production_company, "Production Company")
      attribute(:date_of_release, "Date of Release")
      attribute(:date_of_end, "Date of End")
      attribute(:number_of_broadcasts, "Number of Broadcasts")
      attribute(:number_of_episodes, "Number of Episodes")

      # The converter block runs with the class as `self`, so it can call the
      # class-level helper defined below.
      attribute(:ans_id, "href") do |href|
        self.to_ans_id(href)
      end

      # Extracts the numeric id following "anime_series/" from a link.
      #
      # @param href [String, nil]
      # @return [Integer, nil] nil when href is missing or has no match
      def self.to_ans_id(href)
        return unless href

        matched = /anime_series\/(\d+)/.match(href)
        matched && matched[1].to_i
      end
    end
  end
end
@@ -0,0 +1,93 @@
1
module MediaartsScraper
  module Data
    # Mixin that turns a class into a lazily evaluated view over a raw data
    # hash. Including it gives the class:
    #
    # * an `attribute` macro that defines a memoized reader per raw-data key,
    # * `attributes` / `attributes_without_copyrighted` name lists,
    # * `to_hash` / `to_json` and their `_without_copyrighted` variants.
    module DataObject
      # Sentinel for "not computed yet". A dedicated object is used so that a
      # legitimately nil attribute value is still cached.
      NULL = Object.new.freeze

      def self.included(klass)
        klass.class_eval do
          attr_reader :raw_data

          # @param raw_data [Hash] values keyed by the labels passed to `attribute`
          def initialize(raw_data)
            @raw_data = raw_data
            @attributes = Hash.new(NULL)
          end

          # @return [Array<Symbol>] all attribute names declared on the class
          def attributes
            self.class.attributes
          end

          # @return [Array<Symbol>] attribute names not marked as copyrighted
          def attributes_without_copyrighted
            self.class.attributes_without_copyrighted
          end

          # Declares a reader for one raw-data entry.
          #
          # @param method [Symbol] name of the reader to define
          # @param en [String] key to look up in the raw data
          # @param copyrighted [Boolean] when true the attribute is left out of
          #   `attributes_without_copyrighted`
          # @yield [raw_value] optional converter; its result is the attribute value
          def self.attribute(method, en, copyrighted = false, &converter)
            attributes << method
            attributes_without_copyrighted << method unless copyrighted

            define_method(method) do
              # Identity check: never dispatches to a user-defined #== on the
              # cached value.
              if NULL.equal?(@attributes[method])
                raw_value = @raw_data[en]
                @attributes[method] = converter ? converter.call(raw_value) : raw_value
              end

              @attributes[method]
            end
          end

          # Per-class list, created on first use.
          def self.attributes
            @_attributes_ = [] unless @_attributes_

            @_attributes_
          end

          # Per-class list, created on first use.
          def self.attributes_without_copyrighted
            @_attributes_without_copyrighted_ = [] unless @_attributes_without_copyrighted_

            @_attributes_without_copyrighted_
          end

          def to_json(*options)
            to_hash.to_json(*options)
          end

          def to_json_without_copyrighted(*options)
            to_hash_without_copyrighted.to_json(*options)
          end

          # @return [Hash] "class" => class name, plus one entry per attribute
          def to_hash
            to_hash0(attributes, :to_hash)
          end

          # @return [Hash] like #to_hash, restricted to non-copyrighted attributes
          def to_hash_without_copyrighted
            to_hash0(attributes_without_copyrighted, :to_hash_without_copyrighted)
          end

          private

          # Builds the hash for the given attribute names. Values, array items
          # and hash values that respond to `method` are converted by calling
          # it; everything else is stored as is.
          def to_hash0(attributes, method)
            convert = lambda { |item| item.respond_to?(method) ? item.send(method) : item }

            attributes.each_with_object("class" => self.class.name) do |attr, hash|
              value = send(attr)

              hash[attr] = case value
                           when Array then value.map(&convert)
                           # Previously called `respond_to` (without `?`) here,
                           # which raised NoMethodError for Hash values.
                           when Hash then value.transform_values(&convert)
                           else convert.call(value)
                           end
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
# Library-wide logger: read with MediaartsScraper.logger, replace with
# MediaartsScraper.logger=.
module MediaartsScraper
  # Installs the default logger. It writes to STDOUT and takes its level from
  # MEDIAARTS_SCRAPER_LOG_LEVEL (converted with to_i); when the variable is
  # unset the level is Logger::Severity::UNKNOWN.
  def self.init_logger
    configured_level = ENV["MEDIAARTS_SCRAPER_LOG_LEVEL"]

    @@logger = Logger.new(STDOUT)
    @@logger.progname = "MediaartsScraper"

    @@logger.level = configured_level.nil? ? Logger::Severity::UNKNOWN : configured_level.to_i
  end

  # @return [Logger] the logger currently in use
  def self.logger
    @@logger
  end

  # @param logger [Logger] replacement logger
  def self.logger=(logger)
    @@logger = logger
  end

  init_logger
end
@@ -0,0 +1,56 @@
1
module MediaartsScraper
  module Page
    # Page object for one page of a series' episode list. `next_page` follows
    # the rel="next" link when the page has one.
    class AnimeEpisodesPage < PageBase
      data_class MediaartsScraper::Data::AnimeEpisodes

      # NOTE(review): PATH ends with "?" and OPTION starts with "?", so
      # generate_url produces "...anime_episodes??display_view=...". The
      # constants are left as they were because the server's handling of the
      # corrected URL cannot be verified from this file -- confirm and fix.
      PATH = "/anime_episodes?"
      OPTION = "?display_view=pc&locale=en"
      PAGE_OPTION = "&page="

      attr_accessor :ans_id

      # @param ans_id [Integer, String] anime series id
      # @param page_num [Integer] page number appended to the URL
      def self.from_ans_id(ans_id, page_num = 1)
        self.new(url: generate_url(ans_id, page_num))
      end

      def self.from_url(url)
        self.new(url: url)
      end

      # Concatenates base URL, series path, id, episode path and query parts.
      def self.generate_url(ans_id, page_num)
        [MediaartsScraper.base_url,
         AnimeSeriesPage::PATH,
         ans_id,
         PATH,
         OPTION,
         PAGE_OPTION,
         page_num].join
      end

      # @return [AnimeEpisodesPage, nil] page object for the next page, or nil
      #   when the document has no rel="next" link
      def next_page
        @next_page ||= parse_next_page
      end

      private

      # Reads the title from the storyBlock heading and the rows of the
      # storyTbl table. The `html` argument is unused; the memoized `doc` is
      # read instead.
      def parse(html)
        heading = doc.at_xpath("//section[@class='storyBlock']/h1").text.strip
        # Keeps everything after the first whitespace on the matched line.
        title = /\s(.+)$/.match(heading).to_a[1]

        story_table = parse_common_serial_rows_table(doc.at_xpath("//table[@class='storyTbl']"))

        {
          "title" => title,
          "story_table" => story_table,
        }
      end

      def parse_next_page
        # The previous code looked up the pager <nav> first and called
        # at_xpath("//a[@rel='next']") on it. That XPath is absolute, so it
        # searched the whole document anyway, and the call raised
        # NoMethodError when the page had no pager. Searching the document
        # directly gives the same result without the nil dereference.
        next_page_url = doc.at_xpath("//a[@rel='next']")&.attribute("href")&.value

        next_page_url ? self.class.from_url(resolve_relative_url(url, next_page_url)) : nil
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
module MediaartsScraper
  module Page
    # Page object for a package page of an anime series.
    class AnimePackagesPage < PageBase
      data_class MediaartsScraper::Data::AnimePackages

      PATH = "/anime_packages/"
      OPTION = "?display_view=pc&locale=en"

      attr_accessor :ans_id

      # @param ans_id [Integer, String] anime series id
      # @param anp_id [Integer, String] anime package id
      def self.from_ans_anp_id(ans_id, anp_id)
        self.new(url: generate_url(ans_id, anp_id))
      end

      # Concatenates base URL, series path, series id, package path, package
      # id and query string.
      def self.generate_url(ans_id, anp_id)
        [MediaartsScraper.base_url,
         AnimeSeriesPage::PATH,
         ans_id,
         PATH,
         anp_id,
         OPTION].join
      end

      private

      # Merges the main key/value table with the physical-collection table,
      # the latter stored under "Physical Collection". The `html` argument is
      # unused; the memoized `doc` is read instead, as the other page classes
      # do, rather than parsing the HTML a second time.
      #
      # @raise [ParseError] when the main table is not found
      def parse(html)
        document_table = doc.at_xpath("//div[@class='main']/section/table[@class='documentTbl']")
        # Explicit error instead of a NoMethodError on nil further down.
        raise ParseError unless document_table

        main_values = parse_common_key_value_table(document_table)

        collection_table = doc.at_xpath("//div[@class='sub']/section/table[@class='documentTbl2']")
        # A page without the table yields an empty collection.
        collection_values = collection_table ? parse_common_key_value_table(collection_table) : {}

        main_values.merge("Physical Collection" => collection_values)
      end
    end
  end
end
@@ -0,0 +1,89 @@
1
module MediaartsScraper
  module Page
    # Page object for an anime series page. Besides the parsed data it can
    # hand out page objects for the linked episode list and package pages.
    class AnimeSeriesPage < PageBase
      data_class MediaartsScraper::Data::AnimeSeries

      PATH = "/an/anime_series/"
      OPTION = "?display_view=pc&locale=en"

      attr_accessor :ans_id

      # @param ans_id [Integer, String] anime series id
      def self.from_ans_id(ans_id)
        new(url: generate_url(ans_id))
      end

      # Base URL, series path, id and query string concatenated.
      def self.generate_url(ans_id)
        "#{MediaartsScraper.base_url}#{PATH}#{ans_id}#{OPTION}"
      end

      # @return [AnimeEpisodesPage, nil] nil when the page has no episode link
      def episodes_page
        @episodes_page ||= parse_episode_page
      end

      # @return [Array<AnimePackagesPage>] one page object per matched link
      def packages_pages
        @packages_pages ||= parse_packages_pages
      end

      private

      # Collects the main key/value tables, the sub-section tables and the
      # "moreContents" blocks into one hash. The `html` argument is unused;
      # the memoized `doc` is read instead.
      def parse(html)
        result = {}

        doc.xpath("//div[@class='main']/section/table").each do |table|
          result.merge!(parse_common_key_value_table(table))
        end

        doc.xpath("//div[@class='sub']/section").each do |section|
          result.merge!(parse_sub_section(section))
        end

        doc.xpath("//div[@class='moreContents']")&.each do |div|
          result.merge!(parse_more_contents(div))
        end

        result
      end

      # Returns { heading => rows } for a sub section, which must contain
      # exactly one seriesTbl2 table.
      #
      # @raise [ParseError] when the section has zero or several such tables
      def parse_sub_section(section)
        heading = section.xpath("h3").first.children.first.text.strip
        candidates = section.xpath("table[@class='seriesTbl2']")

        raise ParseError unless candidates.count == 1

        { heading => parse_common_serial_rows_table(candidates.first) }
      end

      # Returns { heading => merged key/value pairs of all seriesTbl tables }.
      def parse_more_contents(div)
        heading = div.xpath("h3").text.strip

        merged = {}
        div.xpath("section/table[@class='seriesTbl']").each do |table|
          merged.merge!(parse_common_key_value_table(table))
        end

        { heading => merged }
      end

      def parse_episode_page
        link = doc.at_xpath("//div[@class='sub']/section/p[@class='moveStory']/a")

        if link
          target = link.attribute("href").value
          AnimeEpisodesPage.from_url(resolve_relative_url(url, target))
        end
      end

      # NOTE(review): the XPath matches links in every seriesTbl2 table of the
      # sub sections, not only a packages table -- confirm that is intended.
      def parse_packages_pages
        anchors = doc.xpath("//div[@class='sub']/section/table[@class='seriesTbl2']/tbody/tr/td[@class='i']/a")

        anchors.map do |anchor|
          target = anchor.attribute("href").value
          AnimePackagesPage.from_url(resolve_relative_url(url, target))
        end
      end
    end
  end
end
@@ -0,0 +1,80 @@
1
module MediaartsScraper
  module Page
    # Table parsers shared by the page classes. Both methods take a node that
    # answers `xpath` and return plain hashes / arrays of stripped strings.
    module CommonTableParser
      # Joins a cell's key with the term of a nested definition list.
      KEY_SEPARATOR = "/"

      # Parses a table whose rows pair <th> keys with <td> values.
      #
      # A cell with exactly one <p> contributes that paragraph's text. Any
      # definition lists found under "div/div/dl" in such a cell are added as
      # extra entries keyed "<cell key>/<term>". Other cells contribute their
      # whole text.
      #
      # @param table [#xpath] table node
      # @return [Hash{String => String}]
      # @raise [ParseError] when a row has differing <th>/<td> counts, or a
      #   definition list has differing <dt>/<dd> counts
      def parse_common_key_value_table(table)
        table.xpath("tbody/tr").each_with_object({}) do |tr, result|
          ths = tr.xpath("th")
          tds = tr.xpath("td")

          raise ParseError unless ths.count == tds.count

          keys = ths.map { |th| th.text.strip }

          values = tds.each_with_index.map do |td, i|
            paragraphs = td.xpath("p")
            next td.text.strip unless paragraphs.count == 1

            result.merge!(parse_definition_lists(td, keys[i]))
            paragraphs.first.text.strip
          end

          keys.zip(values) { |key, value| result[key] = value }
        end
      end

      # Parses a table with a <thead> header row into one hash per body row.
      #
      # Each cell contributes the text of its first child node, or its own
      # text when it has no child. The first direct <a> found in a row adds
      # an "href" entry.
      #
      # @param table [#xpath] table node
      # @return [Array<Hash{String => String}>]
      def parse_common_serial_rows_table(table)
        header = table.xpath("thead/tr/th").map { |th| th.text.strip }

        table.xpath("tbody/tr").map do |tr|
          tds = tr.xpath("td")
          cells = tds.map { |td| (td.child || td).text.strip }

          row = header.zip(cells).to_h

          anchor = tds.map { |td| td.xpath("a").first }.compact.first
          # `anchor["href"]` is nil for a link without the attribute, where
          # the previous attribute lookup raised.
          row["href"] = anchor["href"] if anchor

          row
        end
      end

      private

      # Collects "<key_prefix>/<term>" => description for every definition
      # list under "div/div/dl" in the cell.
      def parse_definition_lists(td, key_prefix)
        td.xpath("div/div/dl").each_with_object({}) do |dl, nested|
          terms = dl.xpath("dt").map { |dt| dt.text.strip }
          descriptions = dl.xpath("dd").map { |dd| dd.text.strip }

          raise ParseError unless terms.count == descriptions.count

          # Pair each term with its own description. The previous nested
          # loops left every term mapped to the last description.
          terms.zip(descriptions) do |term, description|
            nested[key_prefix + KEY_SEPARATOR + term] = description
          end
        end
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module MediaartsScraper
  module Page
    # Small utilities mixed into the page classes.
    module Helper
      # Resolves a link found on a page against that page's URL.
      #
      # @param base_url [String] URL the link was found on
      # @param relative_url [String] link target, relative or absolute
      # @return [String] the resulting URL
      def resolve_relative_url(base_url, relative_url)
        resolved = URI.join(base_url, relative_url)
        resolved.to_s
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
module MediaartsScraper
  module Page
    # Base class of all page objects. A page is created from a URL and lazily
    # fetches, parses and wraps its content: `html` -> `raw_data` -> `data`.
    # Subclasses declare their data class with `data_class` and implement a
    # private `parse(html)` returning the raw data.
    class PageBase
      include Helper
      include CommonTableParser

      # @param url [String] address of the page
      def initialize(url:)
        @original_url = url
      end

      def self.from_url(url)
        self.new(url: url)
      end

      # Class macro: defines an instance method `data_class` returning the
      # given class.
      def self.data_class(data_class)
        define_method("data_class") do
          data_class
        end
      end

      # @return [Object] instance of the declared data class built from raw_data
      def data
        @data ||= data_class.new(raw_data)
      end

      # @return [String] page body, fetched once
      def html
        @html ||= fetch(url)
      end

      def url
        @original_url
      end

      # @return [Object] result of the subclass's `parse`, computed once
      def raw_data
        @raw_data ||= parse(html)
      end

      # @return [Nokogiri::HTML::Document] parsed HTML, built once
      def doc
        @doc ||= Nokogiri::HTML.parse(html, nil, "utf8")
      end

      # Downloads the content behind `url`.
      #
      # @param url [String]
      # @return [String] response body
      def fetch(url)
        MediaartsScraper.logger.debug { {class: self.class.name, method: "fetch", url: url} }

        # Bare Kernel#open no longer handles URLs on Ruby 3.0+ and also runs
        # "|command" strings. URI.open (open-uri, Ruby 2.5+) is the supported
        # entry point; older Rubies fall back to the open-uri patched
        # Kernel#open exactly as before.
        if URI.respond_to?(:open)
          URI.open(url) { |f| f.read }
        else
          open(url) { |f| f.read }
        end
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
module MediaartsScraper
  module Page
    # Raised by the page parsers when a page does not have the expected
    # structure.
    ParseError = Class.new(StandardError)
  end
end
@@ -0,0 +1,3 @@
1
module MediaartsScraper
  # Gem version; read by the gemspec.
  VERSION = "0.0.0"
end
@@ -0,0 +1,42 @@
1
+ require "nokogiri"
2
+
3
+ require "open-uri"
4
+ require "pp"
5
+ require "json"
6
+ require "logger"
7
+
8
+ require_relative "mediaarts_scraper/version"
9
+
10
+ require_relative "mediaarts_scraper/logger"
11
+
12
+ require_relative "mediaarts_scraper/data/data_object"
13
+ require_relative "mediaarts_scraper/data/anime_series"
14
+ require_relative "mediaarts_scraper/data/anime_series_episode"
15
+ require_relative "mediaarts_scraper/data/anime_series_material"
16
+ require_relative "mediaarts_scraper/data/anime_series_package"
17
+ require_relative "mediaarts_scraper/data/anime_series_related_series"
18
+ require_relative "mediaarts_scraper/data/anime_series_manga_work"
19
+ require_relative "mediaarts_scraper/data/anime_episode"
20
+ require_relative "mediaarts_scraper/data/anime_episodes"
21
+ require_relative "mediaarts_scraper/data/anime_packages"
22
+ require_relative "mediaarts_scraper/data/anime_packages_physical_collection"
23
+
24
+ require_relative "mediaarts_scraper/page/parse_error"
25
+ require_relative "mediaarts_scraper/page/helper"
26
+ require_relative "mediaarts_scraper/page/common_table_parser"
27
+ require_relative "mediaarts_scraper/page/page_base"
28
+ require_relative "mediaarts_scraper/page/anime_series_page"
29
+ require_relative "mediaarts_scraper/page/anime_episodes_page"
30
+ require_relative "mediaarts_scraper/page/anime_packages_page"
31
+
32
module MediaartsScraper
  # Root URL that the page classes prepend to their paths. Override it with
  # MediaartsScraper.base_url=.
  @@base_url = "https://mediaarts-db.bunka.go.jp"

  # @return [String] root URL currently in use
  def self.base_url
    @@base_url
  end

  # @param base_url [String] new root URL
  def self.base_url=(base_url)
    @@base_url = base_url
  end
end
@@ -0,0 +1,29 @@
1
# Make lib/ loadable so the version constant can be read below.
lib = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require "mediaarts_scraper/version"

Gem::Specification.new do |spec|
  spec.name = "mediaarts_scraper"
  spec.version = MediaartsScraper::VERSION
  spec.authors = ["xmisao"]
  spec.email = ["mail@xmisao.com"]

  spec.summary = %q{Scraper for Media Art Database}
  spec.description = %q{Ruby scraper implementation for https://mediaarts-db.bunka.go.jp/}
  spec.homepage = "https://github.com/xmisao/mediaarts_scraper"
  spec.license = "MIT"

  # The library uses the safe-navigation operator and Hash#transform_values,
  # so it cannot run on Rubies older than 2.4.
  spec.required_ruby_version = ">= 2.4.0"

  # Package every tracked file except development-only directories.
  spec.files = `git ls-files -z`.split("\x0").reject do |f|
    f.match(%r{^(tools|examples|bin)/})
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.16"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rufo", ">= 0.3.1"
  spec.add_development_dependency "pry", ">= 0.11.3"

  spec.add_runtime_dependency "nokogiri", ">= 1.8.4"
end
metadata ADDED
@@ -0,0 +1,141 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mediaarts_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - xmisao
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-09-01 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.16'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.16'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rufo
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: 0.3.1
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: 0.3.1
55
+ - !ruby/object:Gem::Dependency
56
+ name: pry
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: 0.11.3
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: 0.11.3
69
+ - !ruby/object:Gem::Dependency
70
+ name: nokogiri
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: 1.8.4
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: 1.8.4
83
+ description: Ruby scraper implementation for https://mediaarts-db.bunka.go.jp/
84
+ email:
85
+ - mail@xmisao.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - ".gitignore"
91
+ - Gemfile
92
+ - LICENSE
93
+ - README.md
94
+ - Rakefile
95
+ - lib/mediaarts_scraper.rb
96
+ - lib/mediaarts_scraper/data/anime_episode.rb
97
+ - lib/mediaarts_scraper/data/anime_episodes.rb
98
+ - lib/mediaarts_scraper/data/anime_packages.rb
99
+ - lib/mediaarts_scraper/data/anime_packages_physical_collection.rb
100
+ - lib/mediaarts_scraper/data/anime_series.rb
101
+ - lib/mediaarts_scraper/data/anime_series_episode.rb
102
+ - lib/mediaarts_scraper/data/anime_series_manga_work.rb
103
+ - lib/mediaarts_scraper/data/anime_series_material.rb
104
+ - lib/mediaarts_scraper/data/anime_series_package.rb
105
+ - lib/mediaarts_scraper/data/anime_series_related_series.rb
106
+ - lib/mediaarts_scraper/data/data_object.rb
107
+ - lib/mediaarts_scraper/logger.rb
108
+ - lib/mediaarts_scraper/page/anime_episodes_page.rb
109
+ - lib/mediaarts_scraper/page/anime_packages_page.rb
110
+ - lib/mediaarts_scraper/page/anime_series_page.rb
111
+ - lib/mediaarts_scraper/page/common_table_parser.rb
112
+ - lib/mediaarts_scraper/page/helper.rb
113
+ - lib/mediaarts_scraper/page/page_base.rb
114
+ - lib/mediaarts_scraper/page/parse_error.rb
115
+ - lib/mediaarts_scraper/version.rb
116
+ - mediaarts_scraper.gemspec
117
+ homepage: https://github.com/xmisao/mediaarts_scraper
118
+ licenses:
119
+ - MIT
120
+ metadata: {}
121
+ post_install_message:
122
+ rdoc_options: []
123
+ require_paths:
124
+ - lib
125
+ required_ruby_version: !ruby/object:Gem::Requirement
126
+ requirements:
127
+ - - ">="
128
+ - !ruby/object:Gem::Version
129
+ version: '0'
130
+ required_rubygems_version: !ruby/object:Gem::Requirement
131
+ requirements:
132
+ - - ">="
133
+ - !ruby/object:Gem::Version
134
+ version: '0'
135
+ requirements: []
136
+ rubyforge_project:
137
+ rubygems_version: 2.5.2.1
138
+ signing_key:
139
+ specification_version: 4
140
+ summary: Scraper for Media Art Database
141
+ test_files: []