RubyGems - wonder_scrape - Versions diffs - 0.1.0 - Mend

wonder_scrape 0.1.0

Files changed (34) hide show

checksums.yaml +7 -0
data/.gitignore +11 -0
data/.rspec +3 -0
data/.ruby-version +1 -0
data/.travis.yml +6 -0
data/CHANGELOG.md +0 -0
data/CODE_OF_CONDUCT.md +74 -0
data/Gemfile +7 -0
data/Gemfile.lock +87 -0
data/LICENSE.txt +21 -0
data/README.md +47 -0
data/Rakefile +8 -0
data/bin/console +15 -0
data/bin/setup +8 -0
data/exe/wonder_scrape +19 -0
data/lib/wonder_scrape.rb +7 -0
data/lib/wonder_scrape/cli.rb +49 -0
data/lib/wonder_scrape/command.rb +41 -0
data/lib/wonder_scrape/commands/.gitkeep +1 -0
data/lib/wonder_scrape/commands/scrape.rb +93 -0
data/lib/wonder_scrape/recorder.rb +47 -0
data/lib/wonder_scrape/scrapers/mfc/field_parsers.rb +71 -0
data/lib/wonder_scrape/scrapers/mfc/item_parser.rb +146 -0
data/lib/wonder_scrape/scrapers/mfc/mfc.rb +5 -0
data/lib/wonder_scrape/scrapers/mfc/scraper.rb +72 -0
data/lib/wonder_scrape/scrapers/scrapers.rb +5 -0
data/lib/wonder_scrape/templates/.gitkeep +1 -0
data/lib/wonder_scrape/templates/scrape/.gitkeep +1 -0
data/lib/wonder_scrape/version.rb +5 -0
data/lib/wonder_scrape/writers/csv.rb +32 -0
data/lib/wonder_scrape/writers/hash.rb +22 -0
data/lib/wonder_scrape/writers/writers.rb +3 -0
data/wonder_scrape.gemspec +38 -0
metadata +150 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: c27721fddd799f4cb631710d07c090a17abdaef56be4b9e725ac15e95bacce36
+  data.tar.gz: fc6469515a2d78a505d0911c0d8698b39a56a2ba7d1774aaf53723bf533c4ef3
+SHA512:
+  metadata.gz: 23f4a73f08832f3ce85d06991ca879efb6a01c1592d53b8684d3a4a3cd8057c3e1335e8a4cb5f247c69026fa853df71e4d7d55c39f55be3cc14a27e60ffd549a
+  data.tar.gz: 8cc004a61c5a3f032c0f3a1e8e2e4028ce14c39ba04f62c77712ead4933418d3b7cdb08cf5fe324446c3f1d7626aaf9a1e2c6a67d2951d9e95ac018c2e90416f

data/.gitignore ADDED Viewed

@@ -0,0 +1,11 @@
+/.bundle/
+/.yardoc
+/_yardoc/
+/coverage/
+/doc/
+/pkg/
+/spec/reports/
+/tmp/
+# rspec failure tracking
+.rspec_status

data/.rspec ADDED Viewed

@@ -0,0 +1,3 @@
+--format documentation
+--color
+--require spec_helper

data/.ruby-version ADDED Viewed

	@@ -0,0 +1 @@
1	+ 2.7.0

data/.travis.yml ADDED Viewed

@@ -0,0 +1,6 @@
+---
+language: ruby
+cache: bundler
+rvm:
+  - 2.7.0
+before_install: gem install bundler -v 2.1.4

data/CHANGELOG.md ADDED Viewed

File without changes

data/CODE_OF_CONDUCT.md ADDED Viewed

@@ -0,0 +1,74 @@
+# Contributor Covenant Code of Conduct
+## Our Pledge
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to making participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, gender identity and expression, level of experience,
+nationality, personal appearance, race, religion, or sexual identity and
+orientation.
+## Our Standards
+Examples of behavior that contributes to creating a positive environment
+include:
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+Examples of unacceptable behavior by participants include:
+* The use of sexualized language or imagery and unwelcome sexual attention or
+advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+  address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+## Our Responsibilities
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+## Scope
+This Code of Conduct applies both within project spaces and in public spaces
+when an individual is representing the project or its community. Examples of
+representing a project or community include using an official project e-mail
+address, posting via an official social media account, or acting as an appointed
+representative at an online or offline event. Representation of a project may be
+further defined and clarified by project maintainers.
+## Enforcement
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at bendawson.rb@gmail.com. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+## Attribution
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at [https://contributor-covenant.org/version/1/4][version]
+[homepage]: https://contributor-covenant.org
+[version]: https://contributor-covenant.org/version/1/4/

data/Gemfile ADDED Viewed

@@ -0,0 +1,7 @@
+source "https://rubygems.org"
+# Specify your gem's dependencies in wonder_scrape.gemspec
+gemspec
+gem "rake", "~> 12.0"
+gem "rspec", "~> 3.0"

data/Gemfile.lock ADDED Viewed

@@ -0,0 +1,87 @@
+PATH
+  remote: .
+  specs:
+    wonder_scrape (0.1.0)
+      nokogiri (~> 1.10.9)
+      thor
+      tty-progressbar
+      tty-prompt
+      upton (~> 0.3.6)
+GEM
+  remote: https://rubygems.org/
+  specs:
+    diff-lcs (1.3)
+    domain_name (0.5.20190701)
+      unf (>= 0.0.5, < 1.0.0)
+    equatable (0.6.1)
+    http-accept (1.7.0)
+    http-cookie (1.0.3)
+      domain_name (~> 0.5)
+    mime-types (3.3.1)
+      mime-types-data (~> 3.2015)
+    mime-types-data (3.2020.0425)
+    mini_portile2 (2.4.0)
+    necromancer (0.5.1)
+    netrc (0.11.0)
+    nokogiri (1.10.9)
+      mini_portile2 (~> 2.4.0)
+    pastel (0.7.3)
+      equatable (~> 0.6)
+      tty-color (~> 0.5)
+    rake (12.3.3)
+    rest-client (2.1.0)
+      http-accept (>= 1.7.0, < 2.0)
+      http-cookie (>= 1.0.2, < 2.0)
+      mime-types (>= 1.16, < 4.0)
+      netrc (~> 0.8)
+    rspec (3.9.0)
+      rspec-core (~> 3.9.0)
+      rspec-expectations (~> 3.9.0)
+      rspec-mocks (~> 3.9.0)
+    rspec-core (3.9.1)
+      rspec-support (~> 3.9.1)
+    rspec-expectations (3.9.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-mocks (3.9.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-support (3.9.2)
+    strings-ansi (0.1.0)
+    thor (1.0.1)
+    tty-color (0.5.1)
+    tty-cursor (0.7.1)
+    tty-progressbar (0.17.0)
+      strings-ansi (~> 0.1.0)
+      tty-cursor (~> 0.7)
+      tty-screen (~> 0.7)
+      unicode-display_width (~> 1.6)
+    tty-prompt (0.21.0)
+      necromancer (~> 0.5.0)
+      pastel (~> 0.7.0)
+      tty-reader (~> 0.7.0)
+    tty-reader (0.7.0)
+      tty-cursor (~> 0.7)
+      tty-screen (~> 0.7)
+      wisper (~> 2.0.0)
+    tty-screen (0.7.1)
+    unf (0.1.4)
+      unf_ext
+    unf_ext (0.0.7.7)
+    unicode-display_width (1.7.0)
+    upton (0.3.6)
+      nokogiri (~> 1.5)
+      rest-client (~> 2.0, >= 1.6)
+    wisper (2.0.1)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  rake (~> 12.0)
+  rspec (~> 3.0)
+  wonder_scrape!
+BUNDLED WITH
+   2.1.4

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,21 @@
+The MIT License (MIT)
+Copyright (c) 2020 Benjamin Dawson
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,47 @@
+# WonderScrape
+A project to collect useful information from figure collecting websites.
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'wonder_scrape'
+```
+And then execute:
+    $ bundle install
+Or install it yourself as:
+    $ gem install wonder_scrape
+## Usage
+To get started, run:
+    $ wonder_scrape scrape
+For more configuration options, run:
+    $ wonder_scrape help scrape
+## Development
+After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+## Contributing
+Bug reports and merge requests are welcome on GitLab at https://gitlab.com/maleckai/wonder_scrape. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://gitlab.com/maleckai/wonder_scrape/-/blob/master/CODE_OF_CONDUCT.md).
+## License
+The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
+## Code of Conduct
+Everyone interacting in the WonderScrape project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://gitlab.com/maleckai/wonder_scrape/-/blob/master/CODE_OF_CONDUCT.md).

data/Rakefile ADDED Viewed

@@ -0,0 +1,8 @@
+# frozen_string_literal: true
+require 'bundler/gem_tasks'
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new(:spec)
+task default: :spec

data/bin/console ADDED Viewed

@@ -0,0 +1,15 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+require 'bundler/setup'
+require 'wonder_scrape'
+# You can add fixtures and/or initialization code here to make experimenting
+# with your gem easier. You can also use a different console, if you like.
+# (If you use this, don't forget to add pry to your Gemfile!)
+# require "pry"
+# Pry.start
+require 'irb'
+IRB.start(__FILE__)

data/bin/setup ADDED Viewed

@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+set -euo pipefail
+IFS=$'\n\t'
+set -vx
+bundle install
+# Do any other automated setup that you need to do here

data/exe/wonder_scrape ADDED Viewed

@@ -0,0 +1,19 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+lib_path = File.expand_path('../lib', __dir__)
+$LOAD_PATH.unshift(lib_path) unless $LOAD_PATH.include?(lib_path)
+require 'wonder_scrape'
+require 'wonder_scrape/cli'
+Signal.trap('INT') do
+  warn("\n#{caller.join("\n")}: interrupted")
+  exit(1)
+end
+begin
+  WonderScrape::CLI.start
+rescue WonderScrape::CLI::Error => e
+  puts "ERROR: #{e.message}"
+  exit 1
+end

data/lib/wonder_scrape.rb ADDED Viewed

@@ -0,0 +1,7 @@
+# frozen_string_literal: true
+require 'wonder_scrape/version'
+module WonderScrape
+  class Error < StandardError; end
+end

data/lib/wonder_scrape/cli.rb ADDED Viewed

@@ -0,0 +1,49 @@
+# frozen_string_literal: true
+require 'thor'
+require_relative 'commands/scrape'
+module WonderScrape
+  # Handle the application command line parsing
+  # and the dispatch to various command objects
+  #
+  # @api public
+  class CLI < Thor
+    # Error raised by this runner
+    Error = Class.new(StandardError)
+    desc 'version', 'wonder_scrape version'
+    def version
+      require_relative 'version'
+      puts "v#{WonderScrape::VERSION}"
+    end
+    map %w[--version -v] => :version
+    desc 'scrape', 'Scrape a target website for item data'
+    method_option :target, aliases: '-t', type: :string, banner: 'targetWebsite',
+                           desc: 'Sets the target website for scraping.',
+                           enum: WonderScrape::Commands::Scrape::VALID_SCRAPER_NAMES
+    method_option :output, aliases: '-o', type: :string, banner: 'csv',
+                           desc: 'Specifies the output format',
+                           enum: %w[csv json]
+    method_option :file, aliases: '-f', type: :string, banner: 'path/to/file',
+                         desc: 'Path to the file to write output to. Only necessary for CSV.'
+    method_option :num_pages, aliases: '-n', type: :numeric, banner: 2,
+                              desc: 'Expected number of pages for search results.'
+    method_option :start_page, aliases: '-s', type: :numeric, banner: 1,
+                               desc: 'What page of search results to begin scraping from.'
+    method_option :request_delay, aliases: '-d', type: :numeric, banner: 5,
+                                  desc: 'How long in seconds to wait between requests. Useful to avoid tripping rate limits.'
+    method_option :verbose, aliases: '-v', type: :boolean,
+                            desc: 'Runs in verbose mode, outputting in greater detail'
+    method_option :help, aliases: '-h', type: :boolean,
+                         desc: 'Display usage information'
+    def scrape(*)
+      if options[:help]
+        invoke :help, ['scrape']
+      else
+        WonderScrape::Commands::Scrape.new(options).execute
+      end
+    end
+  end
+end

data/lib/wonder_scrape/command.rb ADDED Viewed

@@ -0,0 +1,41 @@
+# frozen_string_literal: true
+require 'forwardable'
+module WonderScrape
+  class Command
+    extend Forwardable
+    def_delegators :command, :run
+    # Execute this command
+    #
+    # @api public
+    def execute(*)
+      raise(
+        NotImplementedError,
+        "#{self.class}##{__method__} must be implemented"
+      )
+    end
+    # The external commands runner
+    #
+    # @see http://www.rubydoc.info/gems/tty-command
+    #
+    # @api public
+    def command(**options)
+      require 'tty-command'
+      TTY::Command.new(options)
+    end
+    # The interactive prompt
+    #
+    # @see http://www.rubydoc.info/gems/tty-prompt
+    #
+    # @api public
+    def prompt
+      require 'tty-prompt'
+      TTY::Prompt.new(interrupt: :exit)
+    end
+  end
+end

data/lib/wonder_scrape/commands/.gitkeep ADDED Viewed

	@@ -0,0 +1 @@
1	+ #

data/lib/wonder_scrape/commands/scrape.rb ADDED Viewed

@@ -0,0 +1,93 @@
+# frozen_string_literal: true
+require 'tty-progressbar'
+require_relative '../command'
+require_relative '../scrapers/mfc/scraper'
+require_relative '../writers/csv'
+require_relative '../writers/hash'
+require_relative '../recorder'
+module WonderScrape
+  module Commands
+    class Scrape < WonderScrape::Command
+      VALID_SCRAPER_NAMES = [
+        WonderScrape::Scrapers::MFC::Scraper::NAME
+      ].freeze
+      VALID_WRITERS = [
+        WonderScrape::Writers::CSV::NAME,
+        WonderScrape::Writers::Hash::NAME
+      ].freeze
+      def initialize(raw_options)
+        @raw_options = raw_options
+      end
+      def execute(input: $stdin, output: $stdout)
+        recorder = WonderScrape::Recorder.new(output, options)
+        writer = build_writer
+        scraper = build_scraper(writer, recorder)
+        scraper.scrape
+        writer.output_results
+        recorder.print
+      end
+      private
+      attr_reader :raw_options
+      def build_scraper(writer, recorder)
+        target_module.new(writer, recorder, options)
+      end
+      def build_writer
+        case output
+        when WonderScrape::Writers::CSV::NAME
+          WonderScrape::Writers::CSV.new(file, target_module::FIELDS)
+        when WonderScrape::Writers::Hash::NAME
+          WonderScrape::Writers::Hash.new
+        end
+      end
+      def target_module
+        @target_module ||= case target
+                           when WonderScrape::Scrapers::MFC::Scraper::NAME
+                             WonderScrape::Scrapers::MFC::Scraper
+        end
+      end
+      def progress_bar
+        TTY::ProgressBar.new('[:bar] :percent', total: approximate_records)
+      end
+      def target
+        @target ||= raw_options[:target] || prompt.select('What website would you like to scrape?', VALID_SCRAPER_NAMES)
+      end
+      def output
+        @output ||= raw_options[:format] || prompt.select('How would you like to output?', VALID_WRITERS)
+      end
+      def file
+        @file ||= raw_options[:file] || prompt.ask('Please specify the file path you want to write to:', required: true)
+      end
+      def options
+        @options ||= raw_options.merge({
+                                         progress_bar: progress_bar,
+                                         num_pages: num_pages
+                                       })
+      end
+      def approximate_records
+        target_module::RESULTS_PER_PAGE * num_pages
+      end
+      def num_pages
+        @num_pages ||= raw_options[:num_pages] || prompt.ask('How many pages of search results do you want to scrape?', default: target_module::DEFAULT_MAX_PAGES, convert: :int)
+      end
+    end
+  end
+end

data/lib/wonder_scrape/recorder.rb ADDED Viewed

@@ -0,0 +1,47 @@
+# frozen_string_literal: true
+require 'json'
+class WonderScrape::Recorder
+  def initialize(output, options = {})
+    @output = output
+    @verbose = options[:verbose] || false
+    @progress_bar = options[:progress_bar]
+    @items_scraped = 0
+    @item_issues = {}
+    @unexpected_fields = []
+  end
+  def print
+    output.puts "Successfully processed #{items_scraped} items!"
+    if unexpected_fields.count > 0
+      output.puts "Encountered the following unexpected fields: #{unexpected_fields}"
+    end
+    if item_issues.count > 0
+      output.puts "Had issues with #{item_issues.count} items below"
+      output.puts JSON.pretty_generate(item_issues)
+    end
+  end
+  def increment_items_scraped(item)
+    @items_scraped += 1
+    if verbose
+      output.puts JSON.pretty_generate(item)
+    else
+      progress_bar&.advance(1)
+    end
+  end
+  def record_unexpected_field(item_id, field_name)
+    item_issues[item_id] ||= []
+    item_issues[item_id] << "Unexpected field: #{field_name}"
+    unexpected_fields << field_name
+  end
+  private
+  attr_reader :output, :verbose, :items_scraped, :progress_bar
+  attr_accessor :item_issues, :unexpected_fields
+end

data/lib/wonder_scrape/scrapers/mfc/field_parsers.rb ADDED Viewed

@@ -0,0 +1,71 @@
+# frozen_string_literal: true
+require_relative 'mfc'
+module WonderScrape::Scrapers::MFC
+  module FieldParsers
+    class Standard
+      def self.parse(field_content)
+        field_content.text
+      end
+    end
+    class StandardList
+      def self.parse(field_content)
+        field_content.search('a').map(&:text)
+      end
+    end
+    class Price
+      def self.parse(field_content)
+        field_content.search('.item-price').text
+      end
+    end
+    class Dates
+      def self.parse(field_content)
+        field_content.search('a.time').map(&:text)
+      end
+    end
+    class Events
+      def self.parse(field_content)
+        field_content.search('a.item-entry > span').map(&:text)
+      end
+    end
+    class MainImage
+      def self.parse(field_content)
+        image_url = field_content.search('#content .item-picture a.main img').attr('src')
+        parsed_uri = URI.parse(image_url)
+        parsed_uri.query = nil
+        parsed_uri.path = parsed_uri.path.gsub('/big/', '/large/')
+        parsed_uri.to_s
+      end
+    end
+    class AdditionalImages
+      STYLE_URL_REGEX = /url\(([^\(\)]+)\)/.freeze
+      class << self
+        def parse(field_content)
+          field_content.search('#content .item-picture a.more').map do |image_link|
+            extract_clean_url(image_link.attr('style'))
+          end
+        end
+        private
+        def extract_clean_url(style_string)
+          image_url = style_string.scan(STYLE_URL_REGEX).flatten.first
+          parsed_uri = URI.parse(image_url)
+          parsed_uri.query = nil
+          parsed_uri.path = parsed_uri.path.gsub('/thumbnails/', '/')
+          parsed_uri.to_s
+        end
+      end
+    end
+  end
+end

data/lib/wonder_scrape/scrapers/mfc/item_parser.rb ADDED Viewed

@@ -0,0 +1,146 @@
+# frozen_string_literal: true
+require 'nokogiri'
+require_relative 'mfc'
+require_relative 'field_parsers'
+module WonderScrape::Scrapers::MFC
+  class ItemParser
+    DUPLICATE_FIELD_NAMES = {
+      'Artist' => 'Artists',
+      'Character' => 'Characters',
+      'Classification' => 'Classifications',
+      'Event' => 'Events',
+      'Material' => 'Materials',
+      'Release date' => 'Release dates'
+    }.freeze
+    VALID_FIELD_NAMES = [
+      'Title',
+      'Artists',
+      'Category',
+      'Characters',
+      'Classifications',
+      'Company',
+      'Events',
+      'JAN',
+      'Materials',
+      'Numbering',
+      'Origin',
+      'Price',
+      'Release dates',
+      'Scale & Dimensions',
+      'Various',
+      'Version',
+      'Images'
+    ].freeze
+    ID_SELECTOR = '#content #ariadne > a.current'
+    TITLE_SELECTOR = '#content h1 span.headline'
+    FIELD_ELEMENTS_SELECTOR = '#content .data > .form > .form-field'
+    FIELD_NAME_SELECTOR = '.form-label'
+    FIELD_CONTENT_SELECTOR = '.form-input'
+    def self.parse(writer, recorder)
+      proc do |item_html_text|
+        item_html = ::Nokogiri::HTML(item_html_text)
+        new(writer, recorder, item_html).parse
+      end
+    end
+    def initialize(writer, recorder, item_html)
+      @writer = writer
+      @recorder = recorder
+      @item_html = item_html
+      @unexpected_fields = []
+    end
+    def parse
+      result = {}
+      result['Title'] = parsed_title
+      result.merge! parsed_fields
+      result['Images'] = parsed_images
+      writer.write(result)
+      recorder.increment_items_scraped(result)
+    end
+    private
+    attr_reader :writer, :recorder, :item_html
+    def parsed_id
+      id_element.text
+    end
+    def parsed_title
+      title_element.text
+    end
+    def parsed_fields
+      fields = {}
+      field_elements.each do |field_element|
+        field_name = dedupe_field_name(field_name_for(field_element))
+        if unexpected_field?(field_name)
+          recorder.record_unexpected_field(parsed_id, field_name)
+          next
+        end
+        field_content_element = field_content_element_for(field_element)
+        field_value = case field_name
+                      when 'Price'
+                        FieldParsers::Price.parse(field_content_element)
+                      when 'Release dates'
+                        FieldParsers::Dates.parse(field_content_element)
+                      when 'Events'
+                        FieldParsers::Events.parse(field_content_element)
+                      when 'Artists', 'Characters', 'Classifications', 'Materials'
+                        FieldParsers::StandardList.parse(field_content_element)
+                      else
+                        FieldParsers::Standard.parse(field_content_element)
+        end
+        fields[field_name] = field_value
+      end
+      fields
+    end
+    def parsed_images
+      images = []
+      images << FieldParsers::MainImage.parse(item_html)
+      images.concat FieldParsers::AdditionalImages.parse(item_html)
+      images.compact.uniq
+    end
+    def id_element
+      item_html.search(ID_SELECTOR)
+    end
+    def title_element
+      item_html.search(TITLE_SELECTOR)
+    end
+    def field_elements
+      item_html.search(FIELD_ELEMENTS_SELECTOR)
+    end
+    def field_name_for(field_element)
+      field_element.search(FIELD_NAME_SELECTOR).text
+    end
+    def dedupe_field_name(field_name)
+      DUPLICATE_FIELD_NAMES[field_name] || field_name
+    end
+    def field_content_element_for(field_element)
+      field_element.search(FIELD_CONTENT_SELECTOR)
+    end
+    def unexpected_field?(field_name)
+      !VALID_FIELD_NAMES.include?(field_name)
+    end
+  end
+end

data/lib/wonder_scrape/scrapers/mfc/mfc.rb ADDED Viewed

@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+require_relative '../scrapers'
+module WonderScrape::Scrapers::MFC; end

data/lib/wonder_scrape/scrapers/mfc/scraper.rb ADDED Viewed

@@ -0,0 +1,72 @@
+# frozen_string_literal: true
+require 'upton'
+require_relative 'mfc'
+require_relative 'item_parser'
+module WonderScrape::Scrapers::MFC
+  class Scraper
+    NAME = 'MFC'
+    FIELDS = ItemParser::VALID_FIELD_NAMES
+    BASE_URL = 'myfigurecollection.net'
+    SEARCH_PATH = '/browse.v4.php'
+    SEARCH_RESULT_ITEM_SELECTOR = 'ul.listing div.item-icons span.item-icon > a.tbx-tooltip'
+    RESULTS_PER_PAGE = 81
+    DEFAULT_DELAY_BETWEEN_REQUESTS = 2 # seconds
+    DEFAULT_MAX_PAGES = 2
+    DEFAULT_START_PAGE = 1
+    DEFAULT_SEARCH_CATEGORY = 4 # Garage kits
+    def initialize(writer, recorder, options = {})
+      @writer = writer
+      @recorder = recorder
+      @options = options
+    end
+    def scrape
+      scraper.scrape(&ItemParser.parse(writer, recorder))
+    end
+    private
+    attr_reader :writer, :recorder, :options
+    def scraper
+      @scraper ||= build_scraper
+    end
+    def build_scraper
+      new_scraper = Upton::Scraper.new(
+        search_url,
+        SEARCH_RESULT_ITEM_SELECTOR
+      )
+      new_scraper.paginated = true
+      new_scraper.pagination_start_index = options[:start_page] || DEFAULT_START_PAGE
+      new_scraper.pagination_max_pages = options[:num_pages] || DEFAULT_MAX_PAGES
+      new_scraper.verbose = options[:verbose] || false
+      new_scraper.sleep_time_between_requests = options[:request_delay] || DEFAULT_DELAY_BETWEEN_REQUESTS
+      new_scraper
+    end
+    def search_url
+      URI::HTTPS.build(
+        host: BASE_URL,
+        path: SEARCH_PATH,
+        query: build_search_query_params
+      ).to_s
+    end
+    def build_search_query_params
+      URI.encode_www_form({
+                            'mode': 'search',
+                            'categoryId': DEFAULT_SEARCH_CATEGORY,
+                            'sort': 'date',
+                            'order': 'desc'
+                          })
+    end
+  end
+end

data/lib/wonder_scrape/scrapers/scrapers.rb ADDED Viewed

@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+require 'wonder_scrape'
+module WonderScrape::Scrapers; end

data/lib/wonder_scrape/templates/.gitkeep ADDED Viewed

	@@ -0,0 +1 @@
1	+ #

data/lib/wonder_scrape/templates/scrape/.gitkeep ADDED Viewed

	@@ -0,0 +1 @@
1	+ #

data/lib/wonder_scrape/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+module WonderScrape
+  VERSION = '0.1.0'
+end

data/lib/wonder_scrape/writers/csv.rb ADDED Viewed

@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+require 'csv'
+require_relative 'writers'
+class WonderScrape::Writers::CSV
+  NAME = 'csv'
+  def initialize(file_name, headers)
+    @headers = headers
+    @csv = build_csv_writer(file_name)
+  end
+  def write(entry)
+    csv << entry.values_at(*headers)
+  end
+  def output_results
+    csv.close
+  end
+  private
+  attr_reader :headers
+  attr_accessor :csv
+  def build_csv_writer(file_name)
+    new_csv = CSV.open(file_name, 'wb')
+    new_csv << headers
+    new_csv
+  end
+end

data/lib/wonder_scrape/writers/hash.rb ADDED Viewed

@@ -0,0 +1,22 @@
+# frozen_string_literal: true
+require 'json'
+require_relative 'writers'
+class WonderScrape::Writers::Hash
+  NAME = 'hash'
+  def initialize
+    @results = []
+  end
+  attr_reader :results
+  def write(entry)
+    @results << entry
+  end
+  def output_results
+    puts JSON.pretty_generate(@results)
+  end
+end

data/lib/wonder_scrape/writers/writers.rb ADDED Viewed

@@ -0,0 +1,3 @@
+# frozen_string_literal: true
+module WonderScrape::Writers; end

data/wonder_scrape.gemspec ADDED Viewed

@@ -0,0 +1,38 @@
+# frozen_string_literal: true
+require_relative 'lib/wonder_scrape/version'
+Gem::Specification.new do |spec|
+  spec.name          = 'wonder_scrape'
+  spec.version       = WonderScrape::VERSION
+  spec.authors       = ['Ben Dawson']
+  spec.email         = ['bendawson.rb@gmail.com']
+  spec.summary       = 'A project to collect useful information from figure collecting websites.'
+  spec.homepage      = 'https://gitlab.com/maleckai/wonder_scrape'
+  spec.license       = 'MIT'
+  spec.required_ruby_version = Gem::Requirement.new('>= 2.3.0')
+  spec.metadata['homepage_uri'] = spec.homepage
+  spec.metadata['source_code_uri'] = spec.homepage
+  spec.metadata['changelog_uri'] = "#{spec.homepage}/-/blob/master/CHANGELOG.md"
+  spec.required_ruby_version = Gem::Requirement.new('>= 2.3.0')
+  spec.metadata['allowed_push_host'] = 'https://rubygems.org'
+  # Specify which files should be added to the gem when it is released.
+  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
+  spec.files = Dir.chdir(File.expand_path(__dir__)) do
+    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+  end
+  spec.bindir        = 'exe'
+  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.require_paths = ['lib']
+  spec.add_dependency 'thor'
+  spec.add_dependency 'tty-progressbar'
+  spec.add_dependency 'tty-prompt'
+  spec.add_runtime_dependency 'nokogiri', ['~> 1.10.9']
+  spec.add_runtime_dependency 'upton', ['~> 0.3.6']
+end

metadata ADDED Viewed

@@ -0,0 +1,150 @@
+--- !ruby/object:Gem::Specification
+name: wonder_scrape
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Ben Dawson
+autorequire:
+bindir: exe
+cert_chain: []
+date: 2020-05-05 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: thor
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: tty-progressbar
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: tty-prompt
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.10.9
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.10.9
+- !ruby/object:Gem::Dependency
+  name: upton
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.3.6
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.3.6
+description:
+email:
+- bendawson.rb@gmail.com
+executables:
+- wonder_scrape
+extensions: []
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- ".rspec"
+- ".ruby-version"
+- ".travis.yml"
+- CHANGELOG.md
+- CODE_OF_CONDUCT.md
+- Gemfile
+- Gemfile.lock
+- LICENSE.txt
+- README.md
+- Rakefile
+- bin/console
+- bin/setup
+- exe/wonder_scrape
+- lib/wonder_scrape.rb
+- lib/wonder_scrape/cli.rb
+- lib/wonder_scrape/command.rb
+- lib/wonder_scrape/commands/.gitkeep
+- lib/wonder_scrape/commands/scrape.rb
+- lib/wonder_scrape/recorder.rb
+- lib/wonder_scrape/scrapers/mfc/field_parsers.rb
+- lib/wonder_scrape/scrapers/mfc/item_parser.rb
+- lib/wonder_scrape/scrapers/mfc/mfc.rb
+- lib/wonder_scrape/scrapers/mfc/scraper.rb
+- lib/wonder_scrape/scrapers/scrapers.rb
+- lib/wonder_scrape/templates/.gitkeep
+- lib/wonder_scrape/templates/scrape/.gitkeep
+- lib/wonder_scrape/version.rb
+- lib/wonder_scrape/writers/csv.rb
+- lib/wonder_scrape/writers/hash.rb
+- lib/wonder_scrape/writers/writers.rb
+- wonder_scrape.gemspec
+homepage: https://gitlab.com/maleckai/wonder_scrape
+licenses:
+- MIT
+metadata:
+  homepage_uri: https://gitlab.com/maleckai/wonder_scrape
+  source_code_uri: https://gitlab.com/maleckai/wonder_scrape
+  changelog_uri: https://gitlab.com/maleckai/wonder_scrape/-/blob/master/CHANGELOG.md
+  allowed_push_host: https://rubygems.org
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: 2.3.0
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.1.2
+signing_key:
+specification_version: 4
+summary: A project to collect useful information from figure collecting websites.
+test_files: []