mediaarts_scraper 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c95b848f065f1110dcf0616253e774f9b4db26be
4
+ data.tar.gz: 86994c5d54fc3c8336ecc0cd2496b28267e2dcf3
5
+ SHA512:
6
+ metadata.gz: 952ce1b825b4e18fc8b6c4ab22b7a3135edcfcbb7e2b4333c4140dfbb3183e0c86db2804554f2927aa5ca8405ad4be1eb77c384f5957836058f83aa091385690
7
+ data.tar.gz: ddef66e45a80cc515c7a8564cdd0a552b34cbf0285d622d2984c20f222f2dbe4bd0c9e50b93c36baeb29ec46a5de2a54bc5b7e5c8020d959d39426fe422898d8
data/.gitignore ADDED
@@ -0,0 +1,10 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+ /tools/sit/data/actual.json
10
+ /Gemfile.lock
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
+ source "https://rubygems.org"
2
+
3
+ git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2018 xmisao
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,92 @@
1
+ # MediaartsScraper
2
+
3
+ Ruby scraper implementation for https://mediaarts-db.bunka.go.jp/
4
+
5
+ 文化庁のメディア芸術データベースをスクレイピングするGemライブラリです。
6
+
7
+ # Installation
8
+
9
+ ```bash
10
+ gem install 'mediaarts_scraper'
11
+ ```
12
+
13
+ # Usage
14
+
15
+ アニメシリーズIDを指定してスクレイピングする。
16
+
17
+ ```ruby
18
+ require "mediaarts_scraper"
19
+
20
+ # アニメシリーズID
21
+ ans_id = 14810 # 魔法少女まどか★マギカ
22
+
23
+ # ページオブジェクトを作る
24
+ anime_series_page = MediaartsScraper::Page::AnimeSeriesPage.from_ans_id(ans_id)
25
+
26
+ # データオブジェクトを得る
27
+ anime_series = anime_series_page.data
28
+
29
+ # データオブジェクトから取得できる属性を得る
30
+ # 属性の名前はサイト上で言語をEnglishにした場合の項目名に準じます
31
+ p anime_series.attributes #=> [:anime_series_id, :distribution_format, :anime_work_id, :title, :title_kana, :date_of_release, :date_of_end, :time_slot_or_duration, :number_of_broadc
32
+ asts, :number_of_episodes, :distributor, :broadcast_period, :production_company, :original_source, :director, :credits, :cast, :theme_songs, :episode_t
33
+ itles, :notes, :description, :story, :main_characters, :character_design, :characters, :music_or_sound_effects, :format_of_original_source, :eirin_no,
34
+ :rating_by_the_eirin, :copyright, :english_title_in_japan, :english_title_overseas, :hepburn_romanization_of_original_title, :romanization_of_original_
35
+ title, :alternative_titles, :related_works, :tag, :carrier_type_of_original_material, :information_sources, :episodes, :materials, :packages, :related_
36
+ series, :manga_work]
37
+
38
+ # データオブジェクトから属性の値を得る
39
+ p anime_series.title #=> "魔法少女まどか★マギカ"
40
+
41
+ # データオブジェクトをハッシュに変換する
42
+ p anime_series.to_hash #=> {"class"=>"MediaartsScraper::Data::AnimeSeries", :anime_series_id=>"ANS001019800", :distribution_format=>"TV", :anime_work_id=>"ANT001019800", :title=>
43
+ "魔法少女まどか★マギカ",...
44
+
45
+ # データオブジェクトをJSONに変換する
46
+ p anime_series.to_json #=> "{\"class\":\"MediaartsScraper::Data::AnimeSeries\",\"anime_series_id\":\"ANS001019800\",\"distribution_format\":\"TV\",\"anime_work_id\":\"ANT001019800\",\"title\":\"魔法少女まどか★マギカ\",...
47
+ ```
48
+
49
+ `examples`以下に使用例があります。
50
+
51
+ # Supported Pages
52
+
53
+ バージョン0.1はアニメーションのみの対応です。
54
+
55
+ |Path|Page title|Page Object|Data Object|
56
+ |:---|:---|:---|:---|
57
+ |`/mg/`|マンガ|-|-|
58
+ |`/an/anime_series/<ans_id>`|作品情報(シリーズ)|`AnimeSeriesPage`|`AnimeSeries`|
59
+ |`/an/anime_series/<ans_id>/anime_episodes`|各話情報一覧|`AnimeEpisodesPage`|`AnimeEpisodes`|
60
+ |`/an/anime_series/<ans_id>/anime_packages/<anp_id>`|パッケージ情報|`AnimePackagesPage`|`AnimePackages`|
61
+ |`/gm/`|ゲーム|-|-|
62
+ |`/ma/`|メディアアート|-|-|
63
+
64
+ `-` means not yet supported.
65
+
66
+ # Development
67
+
68
+ ## Debug
69
+
70
+ To output the internal logs of mediaarts_scraper, set the following environment variable.
71
+
72
+ ```
73
+ MEDIAARTS_SCRAPER_LOG_LEVEL=0
74
+ ```
75
+
76
+ ## Testing
77
+
78
+ Run the system integration tests. (A network connection is required.)
79
+
80
+ ```
81
+ rake sit
82
+ ```
83
+
84
+ Update the expected results of the system integration tests.
85
+
86
+ ```
87
+ rake sit_update
88
+ ```
89
+
90
+ ## Contributing
91
+
92
+ Bug reports and pull requests are welcome on GitHub at https://github.com/xmisao/mediaarts_scraper
data/Rakefile ADDED
@@ -0,0 +1,29 @@
1
+ require "bundler/gem_tasks"
2
# NOTE(review): no :spec task is defined in this Rakefile, so a bare `rake`
# fails until one is added or the default is pointed at another task.
task :default => :spec

desc 'Run formatting by rufo'
task :rufo do
  # The block form of `sh` keeps the previous behaviour of not failing the
  # task on the formatter's exit status.
  sh("rufo Gemfile lib bin examples tools") { |_ok, _status| }
end

desc 'Run system integration testing'
task :sit do
  # `sh` raises when the command exits non-zero, so a failing run now fails
  # the rake task instead of being silently ignored.
  sh "ruby tools/sit/sit.rb"
end

desc 'Update expectation of system integration testing'
task :sit_update do
  sh "ruby tools/sit/sit_update.rb"
end
@@ -0,0 +1,16 @@
1
module MediaartsScraper
  module Data
    # A single row of an episode listing. AnimeEpisodes#episodes wraps each
    # row of its "story_table" raw data in one of these.
    class AnimeEpisode
      include DataObject

      # Each declaration pairs a reader name with the raw-data key it reads.
      attribute :episode_no, "Episode No."
      attribute :episode_title, "Episode Title"
      attribute :date_of_release, "Date of Release"
      attribute :credits, "Credits"
      attribute :cast, "Cast"
      attribute :character, "Character"

      # The trailing `true` flags the attribute as copyrighted, which keeps it
      # out of the *_without_copyrighted views provided by DataObject.
      attribute :story, "Story", true

      attribute :main_mecha, "Main Mecha"
      attribute :notes, "Notes"
      attribute :information_sources, "Information Sources"
    end
  end
end
@@ -0,0 +1,8 @@
1
module MediaartsScraper
  module Data
    # Data object for an episode-list page: the page title plus its rows.
    class AnimeEpisodes
      include DataObject

      attribute :title, "title"

      # Wraps every raw row of the "story_table" entry in an AnimeEpisode.
      attribute(:episodes, "story_table") do |rows|
        rows.map { |row| AnimeEpisode.new(row) }
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
module MediaartsScraper
  module Data
    # Data object for a package detail page. Each declaration pairs a reader
    # name with the raw-data key it reads.
    class AnimePackages
      include DataObject

      attribute :package_id, "Package ID"
      attribute :title, "Title"
      attribute :title_kana, "Title-kana"
      attribute :other_editions, "Other Editions"
      attribute :volumes, "Volumes"
      attribute :content_description, "Content Description"
      attribute :number_of_discs, "Number of Discs / Duration"
      attribute :series_title, "Series Title"
      attribute :series_no, "Series No."
      attribute :publisher_etc, "Publisher etc."
      attribute :credits, "Credits"
      attribute :publication_format, "Publication Format"
      attribute :notes, "Notes"
      attribute :size, "Size"
      attribute :appendices, "Appendices"
      attribute :content_specifications, "Content Specifications"
      attribute :sales_no, "Sales No. (Model No./Stock No.)"
      attribute :jan, "JAN (EAN/UPC)"
      attribute :price, "Price"
      attribute :date_of_publication, "Date of Publication"
      attribute :language, "Language"
      attribute :ratings, "Ratings"
      attribute :place_of_publication, "Place of Publication"
      attribute :national_bib_no_jpno, "National Bib. No. (JPNO)"
      attribute :package_description, "Package Description"
      attribute :tag, "Tag"

      # The nested "Physical Collection" hash is wrapped in its own data object.
      attribute(:physical_collection, "Physical Collection") do |collection|
        AnimePackagesPhysicalCollection.new(collection)
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module MediaartsScraper
  module Data
    # Holding-institution details attached to a package
    # (built by AnimePackages#physical_collection).
    class AnimePackagesPhysicalCollection
      include DataObject

      attribute :package_ownership_information_id, "Package Ownership Information ID"
      attribute :name_of_institution, "Name of Institution"
      attribute :registration_no, "Registration No.(Institutional Collection Item ID)"

      # The misspelled reader is the published name and stays the key used in
      # `attributes` / `to_hash`, so existing callers keep working.
      attribute :notes_from_istitution, "Notes from Institution"

      # Correctly spelled reader. It is an alias only, so it does not add a
      # second entry to `attributes` or to the hash/JSON output.
      alias_method :notes_from_institution, :notes_from_istitution
    end
  end
end
@@ -0,0 +1,51 @@
1
module MediaartsScraper
  module Data
    # Data object for an anime series page. Each declaration pairs a reader
    # name with the raw-data key it reads. A trailing `true` marks the
    # attribute as copyrighted, which keeps it out of the *_without_copyrighted
    # views provided by DataObject.
    class AnimeSeries
      include DataObject

      attribute :anime_series_id, "Anime Series ID"
      attribute :distribution_format, "Distribution Format"
      attribute :anime_work_id, "Anime Work ID"
      attribute :title, "Title"
      attribute :title_kana, "Title-kana"
      attribute :date_of_release, "Date of Release"
      attribute :date_of_end, "Date of End"
      attribute :time_slot_or_duration, "Time Slot / Duration"
      attribute :number_of_broadcasts, "Number of Broadcasts"
      attribute :number_of_episodes, "Number of Episodes"
      attribute :distributor, "Distributor"
      attribute :broadcast_period, "Broadcast Period"
      attribute :production_company, "Production Company"
      attribute :original_source, "Original Source"
      attribute :director, "Director"
      attribute :credits, "Credits"
      attribute :cast, "Cast"
      attribute :theme_songs, "Theme Songs"
      attribute :episode_titles, "Episode Titles"
      attribute :notes, "Notes"
      attribute :description, "Description", true
      attribute :story, "Story", true
      attribute :main_characters, "Main Characters", true
      attribute :character_design, "Character Design"
      attribute :characters, "Characters (Mecha)", true
      attribute :music_or_sound_effects, "Music / Sound Effects"
      attribute :format_of_original_source, "Format of Original Source"
      attribute :eirin_no, "EIRIN No."
      attribute :rating_by_the_eirin, "Rating (by the EIRIN)"
      attribute :copyright, "Copyright"
      attribute :english_title_in_japan, "English Title (In Japan)"
      attribute :english_title_overseas, "English Title (Overseas)"
      attribute :hepburn_romanization_of_original_title, "Hepburn Romanization of Original Title"
      attribute :romanization_of_original_title, "Romanization of Original Title (w/Macron diacritics)"
      attribute :alternative_titles, "Alternative Titles"
      attribute :related_works, "Related Works"
      attribute :tag, "Tag"
      attribute :carrier_type_of_original_material, "Carrier Type of Original Material"
      attribute :information_sources, "Information Sources"

      # Nested collections. A key that is absent from the raw data reads as
      # nil; `&.map` keeps that nil (like every plain attribute) instead of
      # raising NoMethodError, which would also break to_hash / to_json.
      attribute(:episodes, "Episodes") { |rows| rows&.map { |row| AnimeSeriesEpisode.new(row) } }
      attribute(:materials, "Materials") { |rows| rows&.map { |row| AnimeSeriesMaterial.new(row) } }
      attribute(:packages, "Packages") { |rows| rows&.map { |row| AnimeSeriesPackage.new(row) } }
      attribute(:related_series, "Related Series") { |rows| rows&.map { |row| AnimeSeriesRelatedSeries.new(row) } }
      attribute(:manga_work, "Manga Work") { |rows| rows&.map { |row| AnimeSeriesMangaWork.new(row) } }
    end
  end
end
@@ -0,0 +1,10 @@
1
module MediaartsScraper
  module Data
    # One row of the "Episodes" table on a series page
    # (built by AnimeSeries#episodes).
    class AnimeSeriesEpisode
      include DataObject

      # Reader name, then the raw-data key it reads.
      attribute :episode_no, "Episode No."
      attribute :episode_title, "Episode Title"
      attribute :date_of_release, "Date of Release"
      attribute :notes, "Notes"
    end
  end
end
@@ -0,0 +1,14 @@
1
module MediaartsScraper
  module Data
    # One row of the "Manga Work" table on a series page
    # (built by AnimeSeries#manga_work).
    class AnimeSeriesMangaWork
      include DataObject

      attribute :title, "Title"
      attribute :author, "Author"

      # Numeric id taken from the row's link target.
      attribute(:mmt_id, "href") { |href| to_mmt_id(href) }

      # Extracts the digits following "comic_works/" from +href+ as an Integer.
      # Returns nil when +href+ is nil or does not contain such a segment.
      def self.to_mmt_id(href)
        return unless href

        found = /comic_works\/(\d+)/.match(href)
        found && found[1].to_i
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module MediaartsScraper
  module Data
    # One row of the "Materials" table on a series page
    # (built by AnimeSeries#materials).
    class AnimeSeriesMaterial
      include DataObject

      attribute :material_name, "Material Name"
      attribute :category, "Category"

      # The misspelled reader is the published name and stays the key used in
      # `attributes` / `to_hash`, so existing callers keep working.
      attribute :list_of_authers, "List of Authors"

      attribute :access_to_this_material, "Access to this Material"

      # Correctly spelled reader. It is an alias only, so it does not add a
      # second entry to `attributes` or to the hash/JSON output.
      alias_method :list_of_authors, :list_of_authers
    end
  end
end
@@ -0,0 +1,23 @@
1
module MediaartsScraper
  module Data
    # One row of the "Packages" table on a series page
    # (built by AnimeSeries#packages).
    class AnimeSeriesPackage
      include DataObject

      attribute :title, "Title"
      attribute :volumes, "Volumes"
      attribute :other_editions, "Other Editions"
      attribute :publication_format, "Publication Format"
      attribute :date_of_publication, "Date of Publication"
      attribute :access_to_this_material, "Access to this Material"

      # Both ids are derived from the same link target of the row.
      attribute(:ans_id, "href") { |href| to_ans_id(href) }
      attribute(:anp_id, "href") { |href| to_anp_id(href) }

      # Extracts the digits following "anime_series/" from +href+ as an Integer.
      # Returns nil when +href+ is nil or has no such segment.
      def self.to_ans_id(href)
        return unless href

        found = /anime_series\/(\d+)/.match(href)
        found && found[1].to_i
      end

      # Extracts the digits following "anime_packages/" from +href+ as an
      # Integer. Returns nil when +href+ is nil or has no such segment.
      def self.to_anp_id(href)
        return unless href

        found = /anime_packages\/(\d+)/.match(href)
        found && found[1].to_i
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
module MediaartsScraper
  module Data
    # One row of the "Related Series" table on a series page
    # (built by AnimeSeries#related_series).
    class AnimeSeriesRelatedSeries
      include DataObject

      attribute :title, "Title"
      attribute :distribution_format, "Distribution Format"
      attribute :director, "Director"
      attribute :production_company, "Production Company"
      attribute :date_of_release, "Date of Release"
      attribute :date_of_end, "Date of End"
      attribute :number_of_broadcasts, "Number of Broadcasts"
      attribute :number_of_episodes, "Number of Episodes"

      # Series id taken from the row's link target.
      attribute(:ans_id, "href") { |href| to_ans_id(href) }

      # Extracts the digits following "anime_series/" from +href+ as an Integer.
      # Returns nil when +href+ is nil or has no such segment.
      def self.to_ans_id(href)
        return unless href

        found = /anime_series\/(\d+)/.match(href)
        found && found[1].to_i
      end
    end
  end
end
@@ -0,0 +1,93 @@
1
module MediaartsScraper
  module Data
    # Mixin that turns a class into a lazy, read-only view over a raw-data hash.
    #
    # Including it gives the class:
    # * a constructor taking the raw hash (exposed as #raw_data),
    # * the class-level `attribute` macro to declare readers,
    # * #to_hash / #to_json and their *_without_copyrighted variants.
    module DataObject
      # Sentinel for "not computed yet" in the per-instance cache, so that a
      # computed nil is cached like any other value.
      NULL = Object.new.freeze

      def self.included(klass)
        klass.class_eval do
          attr_reader :raw_data

          # raw_data - object indexed with the keys given to `attribute`.
          def initialize(raw_data)
            @raw_data = raw_data
            @attributes = Hash.new(NULL)
          end

          # Names of all declared attributes, in declaration order.
          def attributes
            self.class.attributes
          end

          # Like #attributes, minus those declared as copyrighted.
          def attributes_without_copyrighted
            self.class.attributes_without_copyrighted
          end

          # Declares a reader.
          #
          # method      - name of the reader to define.
          # en          - key looked up in the raw data.
          # copyrighted - when true the attribute is left out of
          #               attributes_without_copyrighted.
          # converter   - optional block; receives the raw value and returns
          #               the value the reader exposes.
          #
          # The value is computed on first access and cached per instance.
          def self.attribute(method, en, copyrighted = false, &converter)
            attributes << method
            attributes_without_copyrighted << method unless copyrighted

            define_method(method) do
              if NULL.equal?(@attributes[method])
                raw_value = @raw_data[en]
                @attributes[method] = converter ? converter.call(raw_value) : raw_value
              end

              @attributes[method]
            end
          end

          def self.attributes
            @_attributes_ ||= []
          end

          def self.attributes_without_copyrighted
            @_attributes_without_copyrighted_ ||= []
          end

          def to_json(*options)
            to_hash.to_json(*options)
          end

          def to_json_without_copyrighted(*options)
            to_hash_without_copyrighted.to_json(*options)
          end

          # Hash with a "class" entry followed by one entry per attribute.
          # Nested data objects are converted recursively.
          def to_hash
            to_hash0(attributes, :to_hash)
          end

          def to_hash_without_copyrighted
            to_hash0(attributes_without_copyrighted, :to_hash_without_copyrighted)
          end

          private

          # Builds the hash for the given attribute names. `method` is the
          # conversion applied to nested values that respond to it, so the
          # copyrighted filter carries through nested data objects.
          def to_hash0(attributes, method)
            convert = ->(item) { item.respond_to?(method) ? item.send(method) : item }

            {"class" => self.class.name}.tap do |hash|
              attributes.each do |attr|
                value = send(attr)

                hash[attr] = case value
                             when Array then value.map(&convert)
                             # Was `respond_to` (no `?`), which raised
                             # NoMethodError for every Hash-valued attribute.
                             when Hash then value.transform_values(&convert)
                             else convert.call(value)
                             end
              end
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
module MediaartsScraper
  # Replaces the logger used by the library.
  def self.logger=(logger)
    @@logger = logger
  end

  # The logger used by the library.
  def self.logger
    @@logger
  end

  # (Re)creates the default logger: writes to STDOUT with the program name
  # "MediaartsScraper", at the level taken from the
  # MEDIAARTS_SCRAPER_LOG_LEVEL environment variable.
  def self.init_logger
    @@logger = Logger.new(STDOUT)
    @@logger.progname = "MediaartsScraper"

    @@logger.level = log_level_from_env
  end

  # Resolves the log level from MEDIAARTS_SCRAPER_LOG_LEVEL.
  #
  # * unset                          -> Logger::Severity::UNKNOWN
  # * severity name, any case,
  #   e.g. "debug" or "WARN"         -> the matching Logger::Severity constant
  # * anything else                  -> String#to_i, as before (so "0" is DEBUG)
  def self.log_level_from_env
    raw = ENV["MEDIAARTS_SCRAPER_LOG_LEVEL"]
    return Logger::Severity::UNKNOWN unless raw

    name = raw.strip.upcase
    if %w[DEBUG INFO WARN ERROR FATAL UNKNOWN].include?(name)
      Logger::Severity.const_get(name)
    else
      raw.to_i
    end
  end
  private_class_method :log_level_from_env

  init_logger
end
@@ -0,0 +1,56 @@
1
module MediaartsScraper
  module Page
    # Page object for the episode list of an anime series.
    # `from_url` is inherited from PageBase.
    class AnimeEpisodesPage < PageBase
      data_class MediaartsScraper::Data::AnimeEpisodes

      # No trailing "?" here: OPTION already starts the query string, and the
      # two together used to produce "...anime_episodes??display_view=...".
      PATH = "/anime_episodes"
      OPTION = "?display_view=pc&locale=en"
      PAGE_OPTION = "&page="

      attr_accessor :ans_id

      # Builds the page object for series +ans_id+, list page +page_num+.
      def self.from_ans_id(ans_id, page_num = 1)
        new(url: generate_url(ans_id, page_num))
      end

      # URL of list page +page_num+ for series +ans_id+.
      def self.generate_url(ans_id, page_num)
        [MediaartsScraper.base_url,
         AnimeSeriesPage::PATH,
         ans_id,
         PATH,
         OPTION,
         PAGE_OPTION,
         page_num].join
      end

      # Page object for the following list page, or nil when there is none.
      def next_page
        @next_page ||= parse_next_page
      end

      private

      # Returns the raw data for Data::AnimeEpisodes: the heading text after
      # its first whitespace as "title", and the rows of the story table.
      def parse(html)
        title = /\s(.+)$/.match(doc.at_xpath("//section[@class='storyBlock']/h1").text.strip).to_a[1]

        story_table = parse_common_serial_rows_table(doc.at_xpath("//table[@class='storyTbl']"))

        {
          "title" => title,
          "story_table" => story_table,
        }
      end

      def parse_next_page
        nav = doc.at_xpath("//nav[@class='pager']")

        # No pager element means there is nothing to follow.
        return unless nav

        # ".//" keeps the search inside the pager; a leading "//" searches the
        # whole document no matter which node it is called on.
        next_page_url = nav.at_xpath(".//a[@rel='next']")&.attribute("href")&.value

        next_page_url ? self.class.from_url(resolve_relative_url(url, next_page_url)) : nil
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
module MediaartsScraper
  module Page
    # Page object for a package detail page of an anime series.
    class AnimePackagesPage < PageBase
      data_class MediaartsScraper::Data::AnimePackages

      PATH = "/anime_packages/"
      OPTION = "?display_view=pc&locale=en"

      attr_accessor :ans_id

      # Builds the page object for package +anp_id+ of series +ans_id+.
      def self.from_ans_anp_id(ans_id, anp_id)
        new(url: generate_url(ans_id, anp_id))
      end

      # URL of the package page.
      def self.generate_url(ans_id, anp_id)
        [MediaartsScraper.base_url,
         AnimeSeriesPage::PATH,
         ans_id,
         PATH,
         anp_id,
         OPTION].join
      end

      private

      # Returns the key/value pairs of the main table, plus the pairs of the
      # "sub" table under the "Physical Collection" key.
      def parse(html)
        # Uses the memoized document from PageBase instead of parsing the
        # HTML a second time into a local that shadows it.
        main_table = doc.at_xpath("//div[@class='main']/section/table[@class='documentTbl']")
        main_items = parse_common_key_value_table(main_table)

        # NOTE(review): assumes the "sub" table can be absent on some pages;
        # it then yields an empty hash instead of failing on nil.
        sub_table = doc.at_xpath("//div[@class='sub']/section/table[@class='documentTbl2']")
        sub_items = sub_table ? parse_common_key_value_table(sub_table) : {}

        main_items.merge("Physical Collection" => sub_items)
      end
    end
  end
end
@@ -0,0 +1,89 @@
1
module MediaartsScraper
  module Page
    # Page object for an anime series page.
    class AnimeSeriesPage < PageBase
      data_class MediaartsScraper::Data::AnimeSeries

      PATH = "/an/anime_series/"
      OPTION = "?display_view=pc&locale=en"

      attr_accessor :ans_id

      # Builds the page object for series +ans_id+.
      def self.from_ans_id(ans_id)
        new(url: generate_url(ans_id))
      end

      # URL of the series page.
      def self.generate_url(ans_id)
        [MediaartsScraper.base_url, PATH, ans_id, OPTION].join
      end

      # Page object for the linked episode list, or nil when the page has no
      # such link.
      def episodes_page
        @episodes_page ||= parse_episode_page
      end

      # Page objects for every package linked from the page.
      def packages_pages
        @packages_pages ||= parse_packages_pages
      end

      private

      # Merges, in this order, the key/value tables of the main column, the
      # row tables of the sub column and the "moreContents" blocks into one
      # hash.
      def parse(html)
        result = {}

        doc.xpath("//div[@class='main']/section/table").each do |table|
          result.merge!(parse_common_key_value_table(table))
        end

        doc.xpath("//div[@class='sub']/section").each do |section|
          result.merge!(parse_sub_section(section))
        end

        doc.xpath("//div[@class='moreContents']").each do |div|
          result.merge!(parse_more_contents(div))
        end

        result
      end

      # Returns { heading => rows } for one section of the sub column.
      # Raises ParseError unless the section holds exactly one row table.
      def parse_sub_section(section)
        heading = section.xpath("h3").first.children.first.text.strip
        tables = section.xpath("table[@class='seriesTbl2']")

        raise ParseError if tables.count != 1

        {heading => parse_common_serial_rows_table(tables.first)}
      end

      # Returns { heading => merged key/value pairs } for one "moreContents"
      # block.
      def parse_more_contents(div)
        heading = div.xpath("h3").text.strip
        items = {}

        div.xpath("section/table[@class='seriesTbl']").each do |table|
          items.merge!(parse_common_key_value_table(table))
        end

        {heading => items}
      end

      def parse_episode_page
        anchor = doc.at_xpath("//div[@class='sub']/section/p[@class='moveStory']/a")
        return unless anchor

        href = anchor.attribute("href").value
        AnimeEpisodesPage.from_url(resolve_relative_url(url, href))
      end

      def parse_packages_pages
        anchors = doc.xpath("//div[@class='sub']/section/table[@class='seriesTbl2']/tbody/tr/td[@class='i']/a")

        anchors.map do |anchor|
          href = anchor.attribute("href").value
          AnimePackagesPage.from_url(resolve_relative_url(url, href))
        end
      end
    end
  end
end
@@ -0,0 +1,80 @@
1
module MediaartsScraper
  module Page
    # Parsers for the two table layouts shared by the page classes. The
    # methods only need nodes that answer `xpath`, `text`, `child` and
    # `attributes` in the way the calls below use them.
    module CommonTableParser
      # Joins a row key and a nested definition-list term ("Key/Term").
      KEY_SEPARATOR = "/"

      # Parses a table whose rows pair <th> keys with <td> values.
      #
      # Returns a Hash of stripped key => stripped value. A cell holding
      # exactly one <p> contributes that paragraph's text as its value, and
      # each <dt>/<dd> pair under div/div/dl in it is added as
      # "Key/Term" => definition.
      #
      # Raises ParseError when a row's <th> and <td> counts differ, or a <dl>
      # has different numbers of <dt> and <dd>.
      def parse_common_key_value_table(table)
        table.xpath("tbody/tr").each_with_object({}) do |tr, result|
          ths = tr.xpath("th")
          tds = tr.xpath("td")

          raise ParseError unless ths.count == tds.count

          keys = ths.map { |th| th.text.strip }
          values = tds.each_with_index.map { |td, i| parse_key_value_cell(td, keys[i], result) }

          keys.zip(values).each { |key, value| result[key] = value }
        end
      end

      # Parses a table with a <thead> header row and one record per <tbody>
      # row.
      #
      # Returns an Array with one Hash per row, mapping header text to the
      # stripped text of the cell's first child node (or of the cell itself
      # when it has no child). When a cell of the row directly contains an
      # <a>, that anchor's href is added under "href".
      def parse_common_serial_rows_table(table)
        header = table.xpath("thead/tr/th").map { |th| th.text.strip }

        table.xpath("tbody/tr").map do |tr|
          tds = tr.xpath("td")
          data = tds.map { |td| (td.child || td).text.strip }

          tr_result = header.zip(data).to_h

          link_element = tds.find { |td| td.xpath("a").first }
          tr_result["href"] = link_element.xpath("a").first.attributes["href"].value if link_element

          tr_result
        end
      end

      private

      # Returns the value of one <td> and, as a side effect, stores the
      # cell's nested definition-list pairs in +result+ under
      # "key/term".
      def parse_key_value_cell(td, key, result)
        paragraphs = td.xpath("p")
        return td.text.strip unless paragraphs.count == 1

        td.xpath("div/div/dl").each do |dl|
          terms = dl.xpath("dt").map { |dt| dt.text.strip }
          definitions = dl.xpath("dd").map { |dd| dd.text.strip }

          raise ParseError unless terms.count == definitions.count

          # Pair each term with its own definition. Nested loops used to
          # assign every term the last definition of the list.
          terms.zip(definitions).each do |term, definition|
            result[key + KEY_SEPARATOR + term] = definition
          end
        end

        paragraphs.first.text.strip
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module MediaartsScraper
  module Page
    # Small helpers shared by the page classes.
    module Helper
      # Resolves +relative_url+ against +base_url+ and returns the result as
      # a String.
      def resolve_relative_url(base_url, relative_url)
        resolved = URI.join(base_url, relative_url)
        resolved.to_s
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
module MediaartsScraper
  module Page
    # Base class of the page objects. A page is created from a URL; its HTML,
    # parsed document, raw data and data object are each computed on first
    # use and then cached. Subclasses declare their data class with
    # `data_class` and implement a private `parse(html)` returning the raw
    # data.
    class PageBase
      include Helper
      include CommonTableParser

      def initialize(url:)
        @original_url = url
      end

      # Builds a page object for +url+.
      def self.from_url(url)
        new(url: url)
      end

      # Class macro: defines the `data_class` reader used by #data.
      def self.data_class(data_class)
        define_method("data_class") do
          data_class
        end
      end

      # The data object built from #raw_data.
      def data
        @data ||= data_class.new(raw_data)
      end

      # The fetched page source.
      def html
        @html ||= fetch(url)
      end

      # The URL this page was created with.
      def url
        @original_url
      end

      # The result of the subclass's `parse`.
      def raw_data
        @raw_data ||= parse(html)
      end

      # The page source parsed with Nokogiri.
      def doc
        @doc ||= Nokogiri::HTML.parse(html, nil, "utf8")
      end

      # Reads and returns the content at +url+.
      #
      # The URL is opened through URI#open (open-uri), not Kernel#open:
      # Kernel#open stopped accepting URLs in Ruby 3.0 and executes a command
      # for strings starting with "|". Anything open-uri cannot open is read
      # as a local file path.
      def fetch(url)
        MediaartsScraper.logger.debug { {class: self.class.name, method: "fetch", url: url} }

        target = URI.parse(url)

        if target.is_a?(OpenURI::OpenRead)
          target.open { |f| f.read }
        else
          File.open(url) { |f| f.read }
        end
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
module MediaartsScraper
  module Page
    # Raised by the page parsers when a page does not have the expected
    # structure.
    class ParseError < StandardError
    end
  end
end
@@ -0,0 +1,3 @@
1
module MediaartsScraper
  # Gem version, read by the gemspec. Frozen so the shared string cannot be
  # mutated in place.
  VERSION = "0.0.0".freeze
end
@@ -0,0 +1,42 @@
1
+ require "nokogiri"
2
+
3
+ require "open-uri"
4
+ require "pp"
5
+ require "json"
6
+ require "logger"
7
+
8
+ require_relative "mediaarts_scraper/version"
9
+
10
+ require_relative "mediaarts_scraper/logger"
11
+
12
+ require_relative "mediaarts_scraper/data/data_object"
13
+ require_relative "mediaarts_scraper/data/anime_series"
14
+ require_relative "mediaarts_scraper/data/anime_series_episode"
15
+ require_relative "mediaarts_scraper/data/anime_series_material"
16
+ require_relative "mediaarts_scraper/data/anime_series_package"
17
+ require_relative "mediaarts_scraper/data/anime_series_related_series"
18
+ require_relative "mediaarts_scraper/data/anime_series_manga_work"
19
+ require_relative "mediaarts_scraper/data/anime_episode"
20
+ require_relative "mediaarts_scraper/data/anime_episodes"
21
+ require_relative "mediaarts_scraper/data/anime_packages"
22
+ require_relative "mediaarts_scraper/data/anime_packages_physical_collection"
23
+
24
+ require_relative "mediaarts_scraper/page/parse_error"
25
+ require_relative "mediaarts_scraper/page/helper"
26
+ require_relative "mediaarts_scraper/page/common_table_parser"
27
+ require_relative "mediaarts_scraper/page/page_base"
28
+ require_relative "mediaarts_scraper/page/anime_series_page"
29
+ require_relative "mediaarts_scraper/page/anime_episodes_page"
30
+ require_relative "mediaarts_scraper/page/anime_packages_page"
31
+
32
module MediaartsScraper
  # Root URL that the page classes prepend when generating URLs.
  @@base_url = "https://mediaarts-db.bunka.go.jp"

  # The current root URL.
  def self.base_url
    @@base_url
  end

  # Replaces the root URL.
  def self.base_url=(base_url)
    @@base_url = base_url
  end
end
@@ -0,0 +1,29 @@
1
+ lib = File.expand_path("../lib", __FILE__)
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+ require "mediaarts_scraper/version"
4
+
5
Gem::Specification.new do |spec|
  spec.name = "mediaarts_scraper"
  spec.version = MediaartsScraper::VERSION
  spec.authors = ["xmisao"]
  spec.email = ["mail@xmisao.com"]

  spec.summary = %q{Scraper for Media Art Database}
  spec.description = %q{Ruby scraper implementation for https://mediaarts-db.bunka.go.jp/}
  spec.homepage = "https://github.com/xmisao/mediaarts_scraper"
  spec.license = "MIT"

  # The library uses the safe-navigation operator (Ruby 2.3) and
  # Hash#transform_values (Ruby 2.4); declare that so older interpreters are
  # rejected at install time.
  spec.required_ruby_version = ">= 2.4.0"

  # Ship every file tracked by git except development-only directories.
  spec.files = `git ls-files -z`.split("\x0").reject do |f|
    f.match(%r{^(tools|examples|bin)/})
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.16"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rufo", ">= 0.3.1"
  spec.add_development_dependency "pry", ">= 0.11.3"

  spec.add_runtime_dependency "nokogiri", ">= 1.8.4"
end
metadata ADDED
@@ -0,0 +1,141 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mediaarts_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - xmisao
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-09-01 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.16'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.16'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rufo
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: 0.3.1
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: 0.3.1
55
+ - !ruby/object:Gem::Dependency
56
+ name: pry
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: 0.11.3
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: 0.11.3
69
+ - !ruby/object:Gem::Dependency
70
+ name: nokogiri
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: 1.8.4
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: 1.8.4
83
+ description: Ruby scraper implementation for https://mediaarts-db.bunka.go.jp/
84
+ email:
85
+ - mail@xmisao.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - ".gitignore"
91
+ - Gemfile
92
+ - LICENSE
93
+ - README.md
94
+ - Rakefile
95
+ - lib/mediaarts_scraper.rb
96
+ - lib/mediaarts_scraper/data/anime_episode.rb
97
+ - lib/mediaarts_scraper/data/anime_episodes.rb
98
+ - lib/mediaarts_scraper/data/anime_packages.rb
99
+ - lib/mediaarts_scraper/data/anime_packages_physical_collection.rb
100
+ - lib/mediaarts_scraper/data/anime_series.rb
101
+ - lib/mediaarts_scraper/data/anime_series_episode.rb
102
+ - lib/mediaarts_scraper/data/anime_series_manga_work.rb
103
+ - lib/mediaarts_scraper/data/anime_series_material.rb
104
+ - lib/mediaarts_scraper/data/anime_series_package.rb
105
+ - lib/mediaarts_scraper/data/anime_series_related_series.rb
106
+ - lib/mediaarts_scraper/data/data_object.rb
107
+ - lib/mediaarts_scraper/logger.rb
108
+ - lib/mediaarts_scraper/page/anime_episodes_page.rb
109
+ - lib/mediaarts_scraper/page/anime_packages_page.rb
110
+ - lib/mediaarts_scraper/page/anime_series_page.rb
111
+ - lib/mediaarts_scraper/page/common_table_parser.rb
112
+ - lib/mediaarts_scraper/page/helper.rb
113
+ - lib/mediaarts_scraper/page/page_base.rb
114
+ - lib/mediaarts_scraper/page/parse_error.rb
115
+ - lib/mediaarts_scraper/version.rb
116
+ - mediaarts_scraper.gemspec
117
+ homepage: https://github.com/xmisao/mediaarts_scraper
118
+ licenses:
119
+ - MIT
120
+ metadata: {}
121
+ post_install_message:
122
+ rdoc_options: []
123
+ require_paths:
124
+ - lib
125
+ required_ruby_version: !ruby/object:Gem::Requirement
126
+ requirements:
127
+ - - ">="
128
+ - !ruby/object:Gem::Version
129
+ version: '0'
130
+ required_rubygems_version: !ruby/object:Gem::Requirement
131
+ requirements:
132
+ - - ">="
133
+ - !ruby/object:Gem::Version
134
+ version: '0'
135
+ requirements: []
136
+ rubyforge_project:
137
+ rubygems_version: 2.5.2.1
138
+ signing_key:
139
+ specification_version: 4
140
+ summary: Scraper for Media Art Database
141
+ test_files: []