RubyGems - openc_bot - Versions diffs - 0.0.11 - Mend

openc_bot 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (85) hide show

checksums.yaml +7 -0
data/.gitignore +22 -0
data/.travis.yml +8 -0
data/CHANGELOG.md +2 -0
data/Gemfile +8 -0
data/LICENSE.txt +22 -0
data/README.md +253 -0
data/Rakefile +14 -0
data/bin/openc_bot +13 -0
data/create_bot.sh +30 -0
data/create_company_bot.sh +16 -0
data/create_simple_licence_bot.sh +31 -0
data/db/.gitkeep +0 -0
data/examples/basic/.gitignore +3 -0
data/examples/basic/Gemfile +7 -0
data/examples/basic/config.yml +21 -0
data/examples/basic/lib/basic.rb +88 -0
data/examples/basic_with_proxy/Gemfile +7 -0
data/examples/basic_with_proxy/config.yml +21 -0
data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
data/examples/bot_with_simple_iterator/Gemfile +6 -0
data/examples/bot_with_simple_iterator/config.yml +21 -0
data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
data/examples/company_fetchers/basic.rb +49 -0
data/lib/monkey_patches/mechanize.rb +53 -0
data/lib/openc_bot.rb +89 -0
data/lib/openc_bot/bot_data_validator.rb +18 -0
data/lib/openc_bot/company_fetcher_bot.rb +40 -0
data/lib/openc_bot/exceptions.rb +17 -0
data/lib/openc_bot/helpers/_csv.rb +10 -0
data/lib/openc_bot/helpers/alpha_search.rb +73 -0
data/lib/openc_bot/helpers/dates.rb +33 -0
data/lib/openc_bot/helpers/html.rb +8 -0
data/lib/openc_bot/helpers/incremental_search.rb +106 -0
data/lib/openc_bot/helpers/register_methods.rb +205 -0
data/lib/openc_bot/helpers/text.rb +18 -0
data/lib/openc_bot/incrementers.rb +2 -0
data/lib/openc_bot/incrementers/base.rb +214 -0
data/lib/openc_bot/incrementers/common.rb +47 -0
data/lib/openc_bot/tasks.rb +385 -0
data/lib/openc_bot/templates/README.md +35 -0
data/lib/openc_bot/templates/bin/export_data +28 -0
data/lib/openc_bot/templates/bin/fetch_data +23 -0
data/lib/openc_bot/templates/bin/verify_data +1 -0
data/lib/openc_bot/templates/config.yml +21 -0
data/lib/openc_bot/templates/lib/bot.rb +43 -0
data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
data/lib/openc_bot/version.rb +3 -0
data/lib/simple_openc_bot.rb +289 -0
data/openc_bot.gemspec +35 -0
data/schemas/company-schema.json +112 -0
data/schemas/includes/address.json +23 -0
data/schemas/includes/base-statement.json +27 -0
data/schemas/includes/company.json +14 -0
data/schemas/includes/filing.json +20 -0
data/schemas/includes/license-data.json +27 -0
data/schemas/includes/officer.json +14 -0
data/schemas/includes/previous_name.json +11 -0
data/schemas/includes/share-parcel-data.json +67 -0
data/schemas/includes/share-parcel.json +60 -0
data/schemas/includes/subsidiary-relationship-data.json +52 -0
data/schemas/includes/total-shares.json +10 -0
data/schemas/licence-schema.json +21 -0
data/schemas/share-parcel-schema.json +21 -0
data/schemas/subsidiary-relationship-schema.json +19 -0
data/spec/dummy_classes/foo_bot.rb +4 -0
data/spec/lib/bot_data_validator_spec.rb +69 -0
data/spec/lib/company_fetcher_bot_spec.rb +93 -0
data/spec/lib/exceptions_spec.rb +25 -0
data/spec/lib/helpers/alpha_search_spec.rb +173 -0
data/spec/lib/helpers/dates_spec.rb +65 -0
data/spec/lib/helpers/incremental_search_spec.rb +471 -0
data/spec/lib/helpers/register_methods_spec.rb +558 -0
data/spec/lib/helpers/text_spec.rb +50 -0
data/spec/lib/openc_bot/db/.gitkeep +0 -0
data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
data/spec/lib/openc_bot_spec.rb +116 -0
data/spec/schemas/company-schema_spec.rb +676 -0
data/spec/simple_openc_bot_spec.rb +302 -0
data/spec/spec_helper.rb +19 -0
metadata +300 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 2c2492325f145ede40b77b666ab93b99c47ba314
+  data.tar.gz: 7aa0dd5faf896d3a7e6a2217092ced8227084e8b
+SHA512:
+  metadata.gz: 9c2709f8c3cb91d06d6e356809e8adc2e16dd5499b5c85e217fe6637c2c4045b6e6add769ffded9844227ef16a04ea387f0d252a4d4ffeb80fd6cea5876f4faf
+  data.tar.gz: 94d5a2d6222a04164ee5f93cb266129790651f3aae587faa20c52af81d4fbdbb15d899b11eb42a2dd2f06aa075d80bf282e1a503d39b3400b07f9da016061271

data/.gitignore ADDED Viewed

@@ -0,0 +1,22 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+.DS_Store
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp
+*~
+db/*
+**/db/*
+!.gitkeep

data/.travis.yml ADDED Viewed

@@ -0,0 +1,8 @@
+language: ruby
+rvm:
+  - "1.9.2"
+  - "1.9.3"
+  - "2.1.0"
+  # - jruby-18mode # JRuby in 1.8 mode
+  # - jruby-19mode # JRuby in 1.9 mode
+  # - rbx

data/CHANGELOG.md ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ # 0.0.1
2	+ * Initial commit

data/Gemfile ADDED Viewed

@@ -0,0 +1,8 @@
+source 'https://rubygems.org'
+gem "sqlite_magic", :git => 'https://github.com/openc/sqlite_magic.git'
+gem "pry", :group => [:development,:test]
+# Specify your gem's dependencies in openc_bot.gemspec
+gemspec
+# we need to do pull request and bump version
+# gem 'scraperwiki', '>=3.0.2', :git => 'https://github.com/openc/scraperwiki-ruby.git'

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,22 @@
+Copyright (c) 2013 Chris Taggart
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,253 @@
+# OpencBot
+## Overview
+This is a gem to allow bots to be written to fetch and format data
+that can be easily imported into OpenCorporates, the largest openly
+licensed database of companies in the world.
+To start writing a new bot, run the following to create a skeleton bot:
+```bash
+mkdir your_bot_name
+cd your_bot_name
+curl -s https://raw.githubusercontent.com/openc/openc_bot/master/create_simple_licence_bot.sh | bash
+```
+The default bot doesn't scrape, it just outputs some dummy data. You can try:
+* running the scrape with `bundle exec openc_bot rake bot:run`
+* testing the validity of the data it will output with
+  `bundle exec openc_bot rake bot:test`
+* viewing a sample of the data with `bundle exec openc_bot rake bot:spotcheck`
+Take a look at the bot code created at
+`your_bot_name/lib/your_bot_name.rb` and read the comments there to
+start writing your own bot.  Look at the example bots in the
+`examples/` folder for inspiration, including how to scrape from a
+website, and how to use "incrementers" to help with resumable,
+incremental scrapes (see below for more).
+These bots are all runnable; you should be able to `cd` to their
+directory, run `bundle install`, and then `bundle exec openc_bot rake
+bot:run`
+You can write bots for any schemas we have defined --
+see [SCHEMAS.md](./doc/SCHEMAS.md) for currently supported schemas.
+When you are happy that your bot is finished, please update its
+`README.md`, change the `enabled` flag in `config.yml` to be `true`,
+and email us.
+Please note that dates are a bit complicated, so we ask you to read
+the bit about dates below carefully.
+## About fetching and transforming data
+As you'll see in the sample bot, bots have separate steps to fetch
+data (the `fetch_all_records` method) and to transform it to a format
+suitable for OpenCorporates (the `to_pipeline` method).
+It is useful to have separate *fetch* and *export* phases for a couple
+of reasons:
+* For very large source datasets, it can take months to complete a
+  scrape. It is then useful to verify the data quality before
+  ingesting it in OpenCorporates.
+* Often, datasets may include a load of potentially interesting data
+  which OpenCorporates doesn't yet support.  It's worth storing this
+  data in an intermediate format, to save having to scrape it again in
+  the future. Please save anything like that and make a note of it in
+  your `README.md`.
+For more complicated scrapers, you may wish to do things more manually
+-- see [README-complex.md](./doc/README-complex.md) for more info.
+# A few words about dates
+There are three kinds of dates that OpenCorporates deals with:
+1. The date on which an observation was true: the `sample_date`. This
+is the date of a bot run, or a reporting date given in the source
+document. Every observation **must have a sample date**.
+2. A `start_date` and/or `end_date` defined explicitly in the source
+document
+3. A `start_date` or `end_date` that has not been provided by the
+source, but which OpenCorporates can infer from one or more sample
+dates. *In this case, you just supply a sample_date, and we do the
+rest*
+All dates should be in ISO8601 format.
+## A few more words about dates
+One of the important parts of the data format expected by
+OpenCorporates is the range of dates over which a statement is known to be true.
+All statements can be considered to be true between a start date and
+an end date. Sources that make explicit statements like this are great,
+but they're rare. For sources that don't explicitly define start and
+end dates for statements, it is down to OpenCorporates to compute
+these based on the bot's run schedule, and sample dates in the source
+data.
+Imagine you are interested in mining licenses in Liliput and
+Brobdingnag, and you want to provide this data to OpenCorporates. You
+find a website that lists mining licenses for these jurisdictions, so
+you write a bot that can submit each license.
+You find that Liliputian licenses have a defined start date and a
+defined end date, which means you can explicitly say "this license is
+valid between 1 June 2012 and 31 Aug 2013" for a particular license.
+In this case, you would submit the data with a `start_date` of
+`2012-06-01` and an `end_date` of `2013-08-31`; and a
+`start_date_type` of `=` and an `end_date_type` of `=`. You would
+also submit a `sample_date` for that document, which is the date on
+which the license was known to be current (often today's date, but
+sometimes the reporting date given in the source).
+However, you find that Brobdingnagian licenses only tell you currently
+issued licenses. As a bot writer, all you can say of a particular
+license is "I saw this license when we ran the bot on 15 January
+2012". In this case, you would leave `start_date` and `end_date`
+blank, and submit a `sample_date` of `2012-01-15` instead.
+If you subsequently see the license on 15 February, you'd submit
+exactly the same data with a new `sample_date`.
+A bot is expected to be run periodically, at intervals relevant to its
+source. For example, a bot that scrapes data which changes monthly should
+scrape at least monthly. You should indicate this in the bot's
+`config.yml` file.
+This means OpenCorporates can infer, based on the running schedule of
+the bot, and the `sample_date`s of its data, the dates between which a
+license was valid (in this case, between 15 January and 15 February).
+Hence the above.
+# Speeding up your tests
+When writing scrapers, it's common to find yourself repeatedly
+scraping data from a source as you iteratively improve your code. It
+can be useful to use a caching proxy on your development machine to
+speed up this cycle.
+If you run `bundle exec openc_bot rake bot:run -- --test`, then your
+`fetch_records` method will receive an option `test_mode`; you can use
+this to turn proxying on or off.  Here's how you can set a proxy using
+the `mechanize` library; if you want to use different http client
+libraries, refer to their documentation regarding how to set a proxy.
+    agent = Mechanize.new
+    if opts[:test_mode]
+      # this requires you to have a working proxy set up -- see
+      # README.md for notes. It can speed up development considerably.
+      agent.set_proxy 'localhost', 8123
+    end
+    agent.get("http://www.foo.com") # will get it from local cache the second time
+To make this work, you will also want to set up a caching proxy
+listening on `localhost:8123`.  One such lightweight proxy is
+[polipo](http://www.pps.univ-paris-diderot.fr/~jch/software/polipo/),
+which is available packaged for various platforms.  The following
+options in the config work for us:
+    cacheIsShared = false
+    disableIndexing = false
+    disableServersList = false
+    relaxTransparency = yes
+    dontTrustVaryETag = yes
+    proxyOffline = no
+# Targeting specific records
+If you define an (optional) `fetch_specific_records` method in your
+bot, then you can specify particular records you wish to be
+fetched, thus:
+    bundle exec openc_bot rake bot:run -- --identifier "Foo Corp"
+You can also target specific records to export with:
+    bundle exec openc_bot rake bot:export -- --identifier "Foo Corp"
+# Incremental, resumable searches
+It's often necessary to do incremental searches or scrapes to get a
+full set of data. For example, you may know that all the records exist
+at urls like http://foo.com/?page=1, http://foo.com/?page=2, etc.
+Another common use case is where you can only access records with a
+search. In these cases, there's no alternative except to search for
+all the possible permutations of the letters A-Z and numbers 0-9 (in
+the case of ASCII-searchable databases).
+In the latter case, for three-character search terms, this is 46656
+(36 x 36 x 36) different possible permutations. This will take a long
+time to scrape. If for some reason
+the scraper gets interrupted, you don't want to have to start again.
+We provide some convenience iterators, which save their current state,
+and resume from it unless told otherwise. They are probably not worth using for
+small scrapes (e.g. ones that take 10 mins) as they add to the complexity
+of your code; however, they are invaluable for large scrapes that may well
+get interrupted.
+    # currently provides a NumericIncrementer and an AsciiIncrementer:
+    require 'openc_bot/incrementers'
+    def fetch_all_records(opts={})
+        counter = NumericIncrementer.new(
+          :my_incrementer,
+          opts.merge(
+              :start_val => 0,
+              :end_val => 20))
+        # yield records one at a time, resuming by default
+        counter.resumable.each do |num|
+          url = "http://assets.opencorporates.com/test_bot_page_#{num}.html"
+          yield record_from_url(url)
+        end
+    end
+The above code would resume an incremental search automatically. To
+reset, run the bot thus:
+    bundle exec openc_bot rake bot:run -- --reset
+When debugging, it is useful to test out only a few iterations at a time. To do this:
+    bundle exec openc_bot rake bot:run -- --max-iterations=3
+This will restrict all iterators to a maximum of three iterations.
+There's also an incrementer which you can manually fill with records
+(arbitrary hashes), thus:
+    incrementer =  OpencBot::ManualIncrementer.new(
+        :my_incrementer,
+        opts.merge(:fields => [:num]))
+    (0..10).each do |num|
+        incrementer.add_row({'num' => num})
+    end
+    # now increment over its values, resuming where we left off last time if interrupted
+    incrementer.resumable.each do |item|
+      doc = agent.get("http://assets.opencorporates.com/document_number#{item["num"]}")
+    end
+ManualIncrementers also have a persisted field named `populated`,
+which you can use to skip expensive record-filling if it's already
+been done:
+    if !incrementer.populated
+        (0..10).each do |num|
+            incrementer.add_row({'num' => num})
+        end
+    end
+    incrementer.populated = true
+There are examples of how this can work in
+`examples/bot_with_simple_iterator`.

data/Rakefile ADDED Viewed

@@ -0,0 +1,14 @@
+require "bundler/gem_tasks"
+# load 'lib/tasks/openc_bot.rake'
+# require 'lib/tasks'
+require 'openc_bot/tasks'
+$LOAD_PATH.unshift File.dirname(__FILE__) + '/../../lib'
+# require 'resque/tasks'
+Dir.glob('lib/tasks/*.rake').each { |r| import r }
+require 'rspec/core/rake_task'
+task :default => :spec
+RSpec::Core::RakeTask.new

data/bin/openc_bot ADDED Viewed

@@ -0,0 +1,13 @@
+gem_dir = File.expand_path("..",File.dirname(__FILE__))
+$LOAD_PATH.unshift gem_dir# Look in gem directory for resources first.
+exec_type = ARGV[0]
+if exec_type == 'rake' then
+  require 'rake'
+  require 'pp'
+  pwd=Dir.pwd
+  Dir.chdir(gem_dir) # We'll load rakefile from the gem's dir.
+  Rake.application.init
+  Rake.application.load_rakefile
+  Dir.chdir(pwd) # Revert to original pwd for any path args passed to task.
+  Rake.application.invoke_task(ARGV[1])
+end

data/create_bot.sh ADDED Viewed

@@ -0,0 +1,30 @@
+#!/bin/bash
+set -e
+# Add the openc_bot to the Gemfile:
+if [ ! -f Gemfile ]; then
+  echo "source 'https://rubygems.org'" >> Gemfile
+  echo "gem 'openc_bot', :git => 'https://github.com/openc/openc_bot.git'" >> Gemfile
+fi
+echo "/db/*" >> .gitignore
+echo "/data/*" >> .gitignore
+echo "/tmp/*" >> .gitignore
+echo "/pids/*" >> .gitignore
+echo "!.gitkeep" >> .gitignore
+mkdir -p db
+mkdir -p data
+mkdir -p tmp
+mkdir -p pids
+touch db/.gitkeep
+touch data/.gitkeep
+touch tmp/.gitkeep
+touch pids/.gitkeep
+bundle install
+# create the bot
+bundle exec openc_bot rake bot:create
+bundle install

data/create_company_bot.sh ADDED Viewed

@@ -0,0 +1,16 @@
+#!/bin/bash
+set -e
+# Add the openc_bot to the Gemfile:
+if [ ! -f Gemfile ]; then
+  echo "source 'https://rubygems.org'" >> Gemfile
+  echo "gem 'openc_bot', :git => 'https://github.com/openc/openc_bot.git', :branch => 'company_fetcher_bot'" >> Gemfile
+fi
+echo "/db" >> .gitignore
+echo "/data" >> .gitignore
+echo "/tmp" >> .gitignore
+bundle install
+# create the bot
+bundle exec openc_bot rake bot:create_company_bot
+bundle install

data/create_simple_licence_bot.sh ADDED Viewed

@@ -0,0 +1,31 @@
+#!/bin/bash
+set -e
+# Add the openc_bot to the Gemfile:
+if [ ! -f Gemfile ]; then
+  echo "source 'https://rubygems.org'" >> Gemfile
+  echo "gem 'openc_bot', :git => 'https://github.com/openc/openc_bot.git'" >> Gemfile
+  echo "gem 'mechanize'" >> Gemfile
+fi
+echo "/db/*" >> .gitignore
+echo "/data/*" >> .gitignore
+echo "/tmp/*" >> .gitignore
+echo "/pids/*" >> .gitignore
+echo "!.gitkeep" >> .gitignore
+mkdir -p db
+mkdir -p data
+mkdir -p tmp
+mkdir -p pids
+touch db/.gitkeep
+touch data/.gitkeep
+touch tmp/.gitkeep
+touch pids/.gitkeep
+bundle install
+# create the bot
+bundle exec openc_bot rake bot:create_simple_bot
+bundle install