RubyGems - spidy - Versions diffs - 0.0.1 - Mend

spidy 0.0.1

Files changed (32) hide show

checksums.yaml +7 -0
data/.gitignore +14 -0
data/.rspec +3 -0
data/.rubocop.yml +23 -0
data/.rubocop_todo.yml +13 -0
data/.ruby-version +1 -0
data/.travis.yml +7 -0
data/CHANGELOG.md +9 -0
data/CODE_OF_CONDUCT.md +74 -0
data/Gemfile +6 -0
data/Gemfile.lock +87 -0
data/LICENSE.txt +21 -0
data/README.md +43 -0
data/Rakefile +8 -0
data/bin/console +22 -0
data/bin/setup +8 -0
data/exe/spidy +17 -0
data/lib/spidy.rb +47 -0
data/lib/spidy/binder.rb +77 -0
data/lib/spidy/connector.rb +10 -0
data/lib/spidy/connector/html.rb +42 -0
data/lib/spidy/connector/xml.rb +31 -0
data/lib/spidy/console.rb +21 -0
data/lib/spidy/definition.rb +103 -0
data/lib/spidy/definition_file.rb +43 -0
data/lib/spidy/looper.rb +22 -0
data/lib/spidy/result.rb +23 -0
data/lib/spidy/shell.rb +79 -0
data/lib/spidy/spider.rb +17 -0
data/lib/spidy/version.rb +5 -0
data/spidy.gemspec +36 -0
metadata +186 -0

checksums.yaml ADDED

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: a1034253dcc3f68d566c3b67ff9ec5c6aeca4ec1b6a2ed66723bda8041154011
+  data.tar.gz: '08ef4a5426111b1824c5547465d0473507f68d9f0ea499bacddc4411395dd25a'
+SHA512:
+  metadata.gz: '01745823727ff14e7b8a4fc97a0487fa32000ae7e09c0241a4bceeab5722df060162b275e99fe77991139788e41f7cb46d1f9d113c5cf93d96efc98855910af3'
+  data.tar.gz: ae0d7b3b6707b939f83e1b8e453c0ad87c60faec363017665ddb38baf77f763ed44994f3223208e92545eaa362b8ae28bcafb5b0c818b050c35aa221d87e00b7

data/.gitignore ADDED

@@ -0,0 +1,14 @@
+/.bundle/
+/.yardoc
+/_yardoc/
+/coverage/
+/doc/
+/pkg/
+/spec/reports/
+/tmp/
+# example crawlers
+examples/
+# rspec failure tracking
+.rspec_status

data/.rspec ADDED

@@ -0,0 +1,3 @@
+--format documentation
+--color
+--require spec_helper

data/.rubocop.yml ADDED

@@ -0,0 +1,23 @@
+# RuboCop configuration for the gem; settings from .rubocop_todo.yml are
+# inherited first.
+inherit_from: .rubocop_todo.yml
+AllCops:
+  DisplayCopNames: true
+  TargetRubyVersion: 2.6
+Style/ClassAndModuleChildren:
+  Enabled: false
+# `fail` is used to signal errors throughout lib/, hence the semantic style.
+Style/SignalException:
+  EnforcedStyle: semantic
+Naming/UncommunicativeMethodParamName:
+  AllowedNames:
+    - as
+# NOTE(review): .rubocop_todo.yml also sets Metrics/LineLength (Max: 96);
+# presumably the value below takes precedence - confirm.
+Metrics/LineLength:
+  Max: 120
+Metrics/BlockLength:
+  Max: 120

data/.rubocop_todo.yml ADDED

@@ -0,0 +1,13 @@
+# This configuration was generated by
+# `rubocop --auto-gen-config`
+# on 2019-03-29 18:00:03 +0900 using RuboCop version 0.66.0.
+# The point is for the user to remove these configuration records
+# one by one as the offenses are removed from the code base.
+# Note that changes in the inspected code, or installation of new
+# versions of RuboCop, may require this file to be generated again.
+# Offense count: 7
+# Configuration parameters: AllowHeredoc, AllowURI, URISchemes, IgnoreCopDirectives, IgnoredPatterns.
+# URISchemes: http, https
+Metrics/LineLength:
+  Max: 96

data/.ruby-version ADDED

@@ -0,0 +1 @@
+2.6.2

data/.travis.yml ADDED

@@ -0,0 +1,7 @@
+---
+sudo: false
+language: ruby
+cache: bundler
+rvm:
+  - 2.6.2
+before_install: gem install bundler -v 2.0.1

data/CHANGELOG.md ADDED

@@ -0,0 +1,9 @@
+# Change Log
+All notable changes to this project will be documented in this file.
+The format is based on [Keep a Changelog](http://keepachangelog.com/)
+and this project adheres to [Semantic Versioning](http://semver.org/).
+## [Unreleased]
+## [0.0.1]

data/CODE_OF_CONDUCT.md ADDED

@@ -0,0 +1,74 @@
+# Contributor Covenant Code of Conduct
+## Our Pledge
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to making participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, gender identity and expression, level of experience,
+nationality, personal appearance, race, religion, or sexual identity and
+orientation.
+## Our Standards
+Examples of behavior that contributes to creating a positive environment
+include:
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+Examples of unacceptable behavior by participants include:
+* The use of sexualized language or imagery and unwelcome sexual attention or
+  advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+  address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+## Our Responsibilities
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+## Scope
+This Code of Conduct applies both within project spaces and in public spaces
+when an individual is representing the project or its community. Examples of
+representing a project or community include using an official project e-mail
+address, posting via an official social media account, or acting as an appointed
+representative at an online or offline event. Representation of a project may be
+further defined and clarified by project maintainers.
+## Enforcement
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at aileron.cc@gmail.com. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+## Attribution
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at [http://contributor-covenant.org/version/1/4][version]
+[homepage]: http://contributor-covenant.org
+[version]: http://contributor-covenant.org/version/1/4/

data/Gemfile ADDED

@@ -0,0 +1,6 @@
+# frozen_string_literal: true
+source 'https://rubygems.org'
+# Specify your gem's dependencies in spidy.gemspec
+gemspec

data/Gemfile.lock ADDED

@@ -0,0 +1,87 @@
+PATH
+  remote: .
+  specs:
+    spidy (0.0.1)
+      activemodel (~> 5.2)
+      activesupport (~> 5.2)
+      mechanize
+      pry
+GEM
+  remote: https://rubygems.org/
+  specs:
+    activemodel (5.2.3)
+      activesupport (= 5.2.3)
+    activesupport (5.2.3)
+      concurrent-ruby (~> 1.0, >= 1.0.2)
+      i18n (>= 0.7, < 2)
+      minitest (~> 5.1)
+      tzinfo (~> 1.1)
+    coderay (1.1.2)
+    concurrent-ruby (1.1.5)
+    connection_pool (2.2.2)
+    diff-lcs (1.3)
+    domain_name (0.5.20190701)
+      unf (>= 0.0.5, < 1.0.0)
+    http-cookie (1.0.3)
+      domain_name (~> 0.5)
+    i18n (1.6.0)
+      concurrent-ruby (~> 1.0)
+    mechanize (2.7.6)
+      domain_name (~> 0.5, >= 0.5.1)
+      http-cookie (~> 1.0)
+      mime-types (>= 1.17.2)
+      net-http-digest_auth (~> 1.1, >= 1.1.1)
+      net-http-persistent (>= 2.5.2)
+      nokogiri (~> 1.6)
+      ntlm-http (~> 0.1, >= 0.1.1)
+      webrobots (>= 0.0.9, < 0.2)
+    method_source (0.9.2)
+    mime-types (3.2.2)
+      mime-types-data (~> 3.2015)
+    mime-types-data (3.2019.0331)
+    mini_portile2 (2.4.0)
+    minitest (5.11.3)
+    net-http-digest_auth (1.4.1)
+    net-http-persistent (3.1.0)
+      connection_pool (~> 2.2)
+    nokogiri (1.10.4)
+      mini_portile2 (~> 2.4.0)
+    ntlm-http (0.1.1)
+    pry (0.12.2)
+      coderay (~> 1.1.0)
+      method_source (~> 0.9.0)
+    rake (10.5.0)
+    rspec (3.8.0)
+      rspec-core (~> 3.8.0)
+      rspec-expectations (~> 3.8.0)
+      rspec-mocks (~> 3.8.0)
+    rspec-core (3.8.0)
+      rspec-support (~> 3.8.0)
+    rspec-expectations (3.8.2)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.8.0)
+    rspec-mocks (3.8.0)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.8.0)
+    rspec-support (3.8.0)
+    thread_safe (0.3.6)
+    tzinfo (1.2.5)
+      thread_safe (~> 0.1)
+    unf (0.1.4)
+      unf_ext
+    unf_ext (0.0.7.6)
+    webrobots (0.1.2)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  bundler (~> 2.0)
+  pry
+  rake (~> 10.0)
+  rspec (~> 3.0)
+  spidy!
+BUNDLED WITH
+   2.0.2

data/LICENSE.txt ADDED

@@ -0,0 +1,21 @@
+The MIT License (MIT)
+Copyright (c) 2019 aileron
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

data/README.md ADDED

@@ -0,0 +1,43 @@
+# Spidy
+Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/spidy`. To experiment with that code, run `bin/console` for an interactive prompt.
+TODO: Delete this and the text above, and describe your gem
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'spidy'
+```
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install spidy
+## Usage
+TODO: Write usage instructions here
+## Development
+After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+## Contributing
+Bug reports and pull requests are welcome on GitHub at https://github.com/aileron-inc/spidy. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
+## License
+The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
+## Code of Conduct
+Everyone interacting in the Spidy project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/aileron-inc/spidy/blob/master/CODE_OF_CONDUCT.md).

data/Rakefile ADDED

@@ -0,0 +1,8 @@
+# frozen_string_literal: true
+require 'bundler/gem_tasks'
+require 'rspec/core/rake_task'
+# Define the `spec` rake task through RSpec's rake integration.
+RSpec::Core::RakeTask.new(:spec)
+# Make `spec` the task that runs when `rake` is invoked without arguments.
+task default: :spec

data/bin/console ADDED

@@ -0,0 +1,22 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+require 'bundler/setup'
+require 'spidy'
+# You can add fixtures and/or initialization code here to make experimenting
+# with your gem easier. You can also use a different console, if you like.
+# (If you use this, don't forget to add pry to your Gemfile!)
+require 'pry'
+# Console helper: clears ActiveSupport's dependency and descendants-tracker
+# state, then asks ActiveSupport::Reloader to reload.
+# NOTE(review): presumably meant to pick up edited sources without restarting
+# the session - confirm that spidy's own autoloads are affected by this.
+def reload!
+  ActiveSupport::Dependencies.clear
+  ActiveSupport::DescendantsTracker.clear
+  ActiveSupport::Reloader.reload!
+end
+if ARGV[0]
+  Spidy.open(ARGV[0]).console
+else
+  Pry.start
+end

data/bin/setup ADDED

@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+set -euo pipefail
+IFS=$'\n\t'
+set -vx
+bundle install
+# Do any other automated setup that you need to do here

data/exe/spidy ADDED

@@ -0,0 +1,17 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+require 'spidy'
+case ARGV[0]&.to_sym
+when :spider then Spidy.open(ARGV[1]).shell.spider(ARGV[2])
+when :scraper then Spidy.open(ARGV[1]).shell.scraper(ARGV[2])
+when :shell then Spidy.open(ARGV[1]).shell.function
+when :new then Spidy.open(ARGV[1]).shell.build
+when :console
+  if ARGV[1].blank?
+    Spidy.console
+  else
+    Spidy.open(ARGV[1]).console
+  end
+end

data/lib/spidy.rb ADDED

@@ -0,0 +1,47 @@
+# frozen_string_literal: true
+require 'spidy/version'
+require 'active_support/all'
+require 'active_model'
+require 'mechanize'
+require 'csv'
+require 'open-uri'
+#
+# web spider dsl engine
+#
+module Spidy
+  extend ActiveSupport::Autoload
+  autoload :Shell
+  autoload :Console
+  autoload :Definition
+  autoload :DefinitionFile
+  autoload :Binder
+  autoload :Spider
+  autoload :Looper
+  autoload :Connector
+  autoload :Result
+  const_set(:Crawler, Module.new) unless const_defined?(:Crawler)
+  def self.console
+    require 'pry'
+    Pry.start(Spidy::Console.new)
+  end
+  def self.open(filepath)
+    ::Spidy::DefinitionFile.open(filepath)
+  end
+  def self.define(name = nil, domain: nil, &block)
+    crawler_definition = Class.new(::Spidy::Definition, &block)
+    crawler_definition.domain = domain
+    if name
+      crawler_class_name = name.to_s.camelize
+      Crawler.class_eval { remove_const(crawler_class_name) } if Crawler.const_defined?(crawler_class_name)
+      Crawler.const_set(crawler_class_name, crawler_definition)
+    end
+    crawler_definition
+  end
+end

data/lib/spidy/binder.rb ADDED

@@ -0,0 +1,77 @@
+# frozen_string_literal: true
+#
+# Bind resource received from the connection to the result object
+#
+class Spidy::Binder
+  #
+  # binding multiple
+  #
+  class Multiple
+    def self.bind(connector:, binder:, query:, block:)
+      multiple_binding_class = self
+      connector.field.call(binder, query) do |elements|
+        multiple_binding_class.new(binder.class).instance_exec(elements, &block)
+      end
+    end
+    def initialize(binder)
+      @binder = binder
+    end
+    def field(name)
+      @binder.field_names << name
+      @binder.field_names.uniq!
+      @binder.result_class.define(name)
+      result = yield
+      @binder.define_method(name) { result }
+    end
+  end
+  class_attribute :field_names, default: []
+  attr_reader :resource
+  def initialize(resource)
+    @resource = resource
+    self.class.fields_call(self)
+  end
+  def result
+    definition = self
+    fetched_at = Time.current
+    result = self.class.result_class.new(fetched_at: fetched_at, fetched_on: fetched_at.beginning_of_day, **attributes)
+    result.define_singleton_method(:resource) { definition.resource }
+    result
+  end
+  def attributes_to_array
+    field_names.map { |field_name| send(field_name) }
+  end
+  def attributes
+    field_names.map { |field_name| [field_name, send(field_name)] }.to_h
+  end
+  def self.query(name, query = nil, &block)
+    define_method(name) do
+      connector.field.call(self, query, &block)
+    end
+  end
+  def self.field(name, query = nil, optional: false, &block)
+    field_names << name
+    field_names.uniq!
+    result_class.define(name, presence: !optional)
+    define_method(name) do
+      connector.field.call(self, query, &block)
+    end
+  end
+  def self.fields(query, &block)
+    @fields = { query: query, block: block }
+  end
+  def self.fields_call(binder)
+    Multiple.bind(connector: connector, binder: binder, query: @fields[:query], block: @fields[:block]) if @fields
+  end
+end

data/lib/spidy/connector.rb ADDED

@@ -0,0 +1,10 @@
+# frozen_string_literal: true
+#
+# Namespace for the connector classes (Html, Xml), which are responsible for
+# actually making a network connection and downloading the document.
+# Both constants are autoloaded on first use.
+#
+module Spidy::Connector
+  extend ActiveSupport::Autoload
+  autoload :Html
+  autoload :Xml
+end

data/lib/spidy/connector/html.rb ADDED

@@ -0,0 +1,42 @@
+# frozen_string_literal: true
+#
+# Mechanize wrapper
+#
+class Spidy::Connector::Html
+  class_attribute :field, default: (lambda { |object, query, &block|
+    node = object.resource.search(query)
+    fail "Could not be located #{query}" if node.nil?
+    return node.first.text if block.nil?
+    object.instance_exec(node, &block)
+  })
+  USER_AGENT = [
+    'Mozilla/5.0',
+    '(Macintosh; Intel Mac OS X 10_12_6)',
+    'AppleWebKit/537.36',
+    '(KHTML, like Gecko)',
+    'Chrome/64.0.3282.186',
+    'Safari/537.36'
+  ].join(' ')
+  attr_reader :start_url
+  attr_reader :agent
+  def initialize(start_url: nil, encoding: nil)
+    @start_url = start_url
+    @agent = Mechanize.new
+    if encoding
+      @agent.default_encoding = encoding
+      @agent.force_default_encoding = true
+    end
+    @agent.user_agent = USER_AGENT
+  end
+  def call(url = @start_url, &block)
+    fail 'URL is undefined' if url.blank?
+    agent.get(url, &block)
+  end
+end

data/lib/spidy/connector/xml.rb ADDED

@@ -0,0 +1,31 @@
+#
+# xml
+#
+# Downloads a document with open-uri, parses it with Nokogiri::XML and
+# provides the `field` lambda that binders use to read values out of it.
+#
+class Spidy::Connector::Xml
+  # field.call(object, query, optional: false, &block)
+  #   With a nil +query+ the block is evaluated against the whole resource.
+  #   Otherwise object.resource is searched for +query+: without a block the
+  #   text of the first match is returned, with a block the block is
+  #   evaluated in the context of +object+ with the matched nodes.
+  #   An empty match returns nil when +optional+, and fails otherwise.
+  class_attribute :field, default: (lambda { |object, query, optional: false, &block|
+    return object.instance_exec(object.resource, &block) if query.nil?
+    node = object.resource.search(query)
+    return if optional && node.empty?
+    fail "Could not be located #{query}" if node.empty?
+    return node.first.text if block.nil?
+    object.instance_exec(node, &block)
+  })
+  # start_url - URL used by `call` when none is passed.
+  # encoding  - stored only; nothing in this class reads it.
+  def initialize(start_url: nil, encoding: nil)
+    @start_url = start_url
+    @encoding = encoding
+  end
+  # Download +url+ (default: start_url), strip control characters from the
+  # body, parse it as XML and yield the parsed document.
+  # Fails when no URL is available.
+  def call(url = @start_url)
+    fail 'URL is undefined' if url.blank?
+    # Block form so the IO object returned by open-uri is closed once read.
+    body = OpenURI.open_uri(url, &:read)
+    # NOTE(review): the range \x00-\x09 also removes TAB (\x09) - confirm
+    # that this is intended.
+    xml = Nokogiri::XML(body.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''))
+    yield xml
+  end
+end

data/lib/spidy/console.rb ADDED

@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+#
+# spidy console
+#
+class Spidy::Console
+  attr_reader :definition_file
+  delegate :spiders, :scrapers, to: :definition_file
+  def initialize(definition_file = nil)
+    @definition_file = definition_file
+  end
+  def open(filepath)
+    @definition_file = Spidy::DefinitionFile.open(filepath)
+  end
+  def reload!
+    @definition_file&.eval_definition
+  end
+end

data/lib/spidy/definition.rb ADDED

@@ -0,0 +1,103 @@
+# frozen_string_literal: true
+#
+# Class representing a website defined by DSL
+#
+class Spidy::Definition
+  class_attribute :domain
+  class_attribute :spiders, default: {}
+  class_attribute :scrapers, default: {}
+  class_attribute :output, default: ->(result) { STDOUT.puts(result.attributes.to_json) }
+  def output(&block)
+    self.output = block
+  end
+  # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
+  class << self
+    def spider(name, start_url = nil, encoding: nil, as: :html, &block)
+      connector_class = Spidy::Connector.const_get(as.to_s.classify)
+      connector = connector_class.new(start_url: start_url, encoding: encoding)
+      spider = Spidy::Spider.new(&block)
+      spider_class = Class.new do
+        define_singleton_method(:connector) { connector }
+        define_singleton_method(:call) do |url = start_url, &spider_block|
+          connector.call(url) do |resource|
+            spider.call(resource, &spider_block)
+          end
+        end
+      end
+      const_set("#{name}_spider".classify, spider_class)
+      spiders[name] = spider_class
+    end
+    def scraper(name, options, &block)
+      if options[:loop]
+        loop_scraper(name, options, &block)
+      else
+        normal_scraper(name, **options, &block)
+      end
+    end
+    private
+    def loop_scraper(name, options, &block)
+      options = { as: :html, start_url: nil, encoding: nil, loop: nil }.merge(options)
+      result_class = Class.new(Spidy::Result)
+      # connector
+      connector_class = Spidy::Connector.const_get(options[:as].to_s.classify)
+      connector = connector_class.new(encoding: options[:encoding])
+      namespace = Class.new do
+        binder = Class.new(Spidy::Binder) do
+          define_singleton_method(:connector) { connector }
+          define_singleton_method(:result_class) { result_class }
+          define_method(:connector) { connector }
+          instance_exec(&block)
+        end
+        define_singleton_method(:call) do |url = options[:start_url], &yielder|
+          connector.call(url) do |resource|
+            looper = Spidy::Looper.new(resource, binder, options[:loop])
+            looper.call(&yielder)
+          end
+        end
+      end
+      const_set("#{name}_scraper".classify, namespace)
+      scrapers[name] = namespace
+    end
+    def normal_scraper(name, encoding: nil, as: :html, &block)
+      # result
+      result_class = Class.new(Spidy::Result)
+      # connector
+      connector_class = Spidy::Connector.const_get(as.to_s.classify)
+      connector = connector_class.new(encoding: encoding)
+      # namespace
+      namespace = Class.new do
+        binder = Class.new(Spidy::Binder) do
+          define_singleton_method(:connector) { connector }
+          define_singleton_method(:result_class) { result_class }
+          define_method(:connector) { connector }
+          instance_exec(&block)
+        end
+        define_singleton_method(:bind) do |url|
+          connector.call(url) do |resource|
+            binder.new(resource)
+          end
+        end
+        define_singleton_method(:call) do |url, &output|
+          result = bind(url).result
+          fail "#{url}\n#{result.errors.full_messages}" if result.invalid?
+          output.call(result)
+        end
+      end
+      const_set("#{name}_scraper".classify, namespace)
+      scrapers[name] = namespace
+    end
+  end
+  # rubocop:enable Metrics/MethodLength, Metrics/AbcSize
+end

data/lib/spidy/definition_file.rb ADDED

@@ -0,0 +1,43 @@
+# frozen_string_literal: true
+#
+# spidy interface binding
+#
+class Spidy::DefinitionFile
+  attr_reader :path
+  attr_reader :definition
+  delegate :spiders, :scrapers, :output, to: :definition
+  CSV = lambda do |result|
+    ::CSV.generate do |csv|
+      csv << result.definition.attributes_to_array
+    end
+  end
+  def self.open(filepath)
+    object = new(filepath)
+    object.eval_definition
+    object
+  end
+  # rubocop:disable Security/Eval
+  def eval_definition
+    @definition = eval(File.open(path).read)
+  end
+  # rubocop:enable Security/Eval
+  def shell
+    @shell ||= Spidy::Shell.new(self)
+  end
+  def console
+    require 'pry'
+    Pry.start(Spidy::Console.new(self))
+  end
+  private
+  def initialize(path)
+    @path = path
+  end
+end

data/lib/spidy/looper.rb ADDED

@@ -0,0 +1,22 @@
+# frozen_string_literal: true
+#
+# looper
+#
+class Spidy::Looper
+  def initialize(resource, binder, loop_block)
+    @resource = resource
+    @binder = binder
+    @loop_block = loop_block
+  end
+  def call
+    yielder = lambda do |element|
+      result = @binder.new(element).result
+      fail "#{element}\n\n#{result.errors.full_messages}" if result.invalid?
+      yield result
+    end
+    @loop_block.call(@resource, yielder)
+  end
+end

data/lib/spidy/result.rb ADDED

@@ -0,0 +1,23 @@
+# frozen_string_literal: true
+#
+# Scrape results
+#
+# ActiveModel-based value object; scrapers subclass it and declare one
+# attribute per field through `define`.
+#
+class Spidy::Result
+  include ActiveModel::Model
+  include ActiveModel::Attributes
+  # Declare attribute +name+.
+  #
+  # Names containing `?` become boolean attributes and, when +presence+ is
+  # set, must be true or false. Any other name becomes an untyped attribute
+  # with a presence validation when +presence+ is set.
+  def self.define(name, presence: true)
+    case name
+    when /.*\?/
+      attribute name, :boolean
+      validates name, inclusion: { in: [true, false] } if presence
+    else
+      attribute name
+      # NOTE(review): `allow_blank: true` presumably lets blank values skip
+      # the presence check - confirm that this is intended.
+      validates name, presence: true, allow_blank: true if presence
+    end
+  end
+  # Set by Spidy::Binder#result when the result is built.
+  attribute :fetched_at
+  attribute :fetched_on
+end

data/lib/spidy/shell.rb ADDED

@@ -0,0 +1,79 @@
+# frozen_string_literal: true
+#
+# spidy shell interface
+#
+class Spidy::Shell
+  attr_reader :definition_file
+  delegate :spiders, :scrapers, to: :definition_file
+  def initialize(definition_file)
+    @definition_file = definition_file
+  end
+  # rubocop:disable Lint/AssignmentInCondition, Style/RescueStandardError
+  def scraper(name)
+    command = scrapers[name.to_sym]
+    fail "undefined commmand[#{name}]" if command.nil?
+    while line = STDIN.gets
+      url = line.strip
+      begin
+        command.call(url, &definition_file.output)
+      rescue => e
+        STDERR.puts "ERROR #{url}: #{e}\n#{e.backtrace}"
+      end
+    end
+  end
+  # rubocop:enable Lint/AssignmentInCondition, Style/RescueStandardError
+  def spider(name)
+    command = spiders[name.to_sym]
+    if File.pipe?(STDIN)
+      STDIN.each_line do |line|
+        start_url = line.strip
+        command.call(start_url) { |url| puts url }
+      end
+    else
+      command.call { |url| puts url }
+    end
+  end
+  def function
+    print <<~SHELL
+      function spider() {
+        spidy spider #{definition_file.path} $1
+      }
+      function scraper() {
+        spidy scraper #{definition_file.path} $1
+      }
+    SHELL
+  end
+  # rubocop:disable Metrics/MethodLength
+  def build(name)
+    File.open("#{name}.rb", 'w') do |f|
+      f.write <<~RUBY
+        # frozen_string_literal: true
+        Spidy.define(:#{name}) do
+          spider(:example, 'http://example.com') do |html, yielder|
+            #  yielder.call(url or resource)
+          end
+          scraper(:example) do
+          end
+        end
+      RUBY
+    end
+    File.open("#{name}.sh", 'w') do |f|
+      f.write <<~SHELL
+        #!/bin/bash
+        eval "$(spidy $(dirname "${0}")/#{name}.rb shell)"
+        spider example
+      SHELL
+    end
+  end
+  # rubocop:enable Metrics/MethodLength
+end

data/lib/spidy/spider.rb ADDED

@@ -0,0 +1,17 @@
+# frozen_string_literal: true
+#
+# Spider
+#
+class Spidy::Spider
+  def initialize(&block)
+    define_singleton_method(:bind, &block)
+  end
+  def call(resource)
+    yielder = lambda do |url|
+      yield url if block_given?
+    end
+    bind(resource, yielder)
+  end
+end

data/lib/spidy/version.rb ADDED

@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+module Spidy
+  # Gem version; spidy.gemspec reads it as the released version.
+  VERSION = '0.0.1'
+end

data/spidy.gemspec ADDED

@@ -0,0 +1,36 @@
+# frozen_string_literal: true
+# Put lib/ on the load path so the version file can be required below.
+lib = File.expand_path('lib', __dir__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'spidy/version'
+# Gem specification for spidy.
+Gem::Specification.new do |spec|
+  spec.name          = 'spidy'
+  spec.version       = Spidy::VERSION
+  spec.authors       = ['aileron']
+  spec.email         = ['aileron.cc@gmail.com']
+  spec.summary       = 'web spider dsl'
+  # spec.description   = 'TODO: Write a longer description or delete this line.'
+  spec.homepage      = 'https://github.com/aileron-inc/spidy'
+  spec.license       = 'MIT'
+  # Specify which files should be added to the gem when it is released.
+  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
+  # Files under test/, spec/ and features/ are left out of the package.
+  spec.files = Dir.chdir(File.expand_path(__dir__)) do
+    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+  end
+  spec.bindir        = 'exe'
+  # Every file under exe/ is installed as an executable.
+  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.require_paths = ['lib']
+  # NOTE(review): pry is declared both as a development and as a runtime
+  # dependency - the development entry looks redundant; confirm and drop one.
+  spec.add_development_dependency 'bundler', '~> 2.0'
+  spec.add_development_dependency 'pry'
+  spec.add_development_dependency 'rake', '~> 10.0'
+  spec.add_development_dependency 'rspec', '~> 3.0'
+  spec.add_runtime_dependency 'activemodel', '~> 5.2'
+  spec.add_runtime_dependency 'activesupport', '~> 5.2'
+  spec.add_runtime_dependency 'mechanize'
+  spec.add_runtime_dependency 'pry'
+end

metadata ADDED

@@ -0,0 +1,186 @@
+--- !ruby/object:Gem::Specification
+name: spidy
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- aileron
+autorequire:
+bindir: exe
+cert_chain: []
+date: 2019-08-21 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.0'
+- !ruby/object:Gem::Dependency
+  name: pry
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+- !ruby/object:Gem::Dependency
+  name: activemodel
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.2'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.2'
+- !ruby/object:Gem::Dependency
+  name: activesupport
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.2'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.2'
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: pry
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+description:
+email:
+- aileron.cc@gmail.com
+executables:
+- spidy
+extensions: []
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- ".rspec"
+- ".rubocop.yml"
+- ".rubocop_todo.yml"
+- ".ruby-version"
+- ".travis.yml"
+- CHANGELOG.md
+- CODE_OF_CONDUCT.md
+- Gemfile
+- Gemfile.lock
+- LICENSE.txt
+- README.md
+- Rakefile
+- bin/console
+- bin/setup
+- exe/spidy
+- lib/spidy.rb
+- lib/spidy/binder.rb
+- lib/spidy/connector.rb
+- lib/spidy/connector/html.rb
+- lib/spidy/connector/xml.rb
+- lib/spidy/console.rb
+- lib/spidy/definition.rb
+- lib/spidy/definition_file.rb
+- lib/spidy/looper.rb
+- lib/spidy/result.rb
+- lib/spidy/shell.rb
+- lib/spidy/spider.rb
+- lib/spidy/version.rb
+- spidy.gemspec
+homepage: https://github.com/aileron-inc/spidy
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.0.3
+signing_key:
+specification_version: 4
+summary: web spider dsl
+test_files: []