RubyGems - openc_bot - Versions diffs - 0.0.11 - Mend

openc_bot 0.0.11

Files changed (85) hide show

checksums.yaml +7 -0
data/.gitignore +22 -0
data/.travis.yml +8 -0
data/CHANGELOG.md +2 -0
data/Gemfile +8 -0
data/LICENSE.txt +22 -0
data/README.md +253 -0
data/Rakefile +14 -0
data/bin/openc_bot +13 -0
data/create_bot.sh +30 -0
data/create_company_bot.sh +16 -0
data/create_simple_licence_bot.sh +31 -0
data/db/.gitkeep +0 -0
data/examples/basic/.gitignore +3 -0
data/examples/basic/Gemfile +7 -0
data/examples/basic/config.yml +21 -0
data/examples/basic/lib/basic.rb +88 -0
data/examples/basic_with_proxy/Gemfile +7 -0
data/examples/basic_with_proxy/config.yml +21 -0
data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
data/examples/bot_with_simple_iterator/Gemfile +6 -0
data/examples/bot_with_simple_iterator/config.yml +21 -0
data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
data/examples/company_fetchers/basic.rb +49 -0
data/lib/monkey_patches/mechanize.rb +53 -0
data/lib/openc_bot.rb +89 -0
data/lib/openc_bot/bot_data_validator.rb +18 -0
data/lib/openc_bot/company_fetcher_bot.rb +40 -0
data/lib/openc_bot/exceptions.rb +17 -0
data/lib/openc_bot/helpers/_csv.rb +10 -0
data/lib/openc_bot/helpers/alpha_search.rb +73 -0
data/lib/openc_bot/helpers/dates.rb +33 -0
data/lib/openc_bot/helpers/html.rb +8 -0
data/lib/openc_bot/helpers/incremental_search.rb +106 -0
data/lib/openc_bot/helpers/register_methods.rb +205 -0
data/lib/openc_bot/helpers/text.rb +18 -0
data/lib/openc_bot/incrementers.rb +2 -0
data/lib/openc_bot/incrementers/base.rb +214 -0
data/lib/openc_bot/incrementers/common.rb +47 -0
data/lib/openc_bot/tasks.rb +385 -0
data/lib/openc_bot/templates/README.md +35 -0
data/lib/openc_bot/templates/bin/export_data +28 -0
data/lib/openc_bot/templates/bin/fetch_data +23 -0
data/lib/openc_bot/templates/bin/verify_data +1 -0
data/lib/openc_bot/templates/config.yml +21 -0
data/lib/openc_bot/templates/lib/bot.rb +43 -0
data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
data/lib/openc_bot/version.rb +3 -0
data/lib/simple_openc_bot.rb +289 -0
data/openc_bot.gemspec +35 -0
data/schemas/company-schema.json +112 -0
data/schemas/includes/address.json +23 -0
data/schemas/includes/base-statement.json +27 -0
data/schemas/includes/company.json +14 -0
data/schemas/includes/filing.json +20 -0
data/schemas/includes/license-data.json +27 -0
data/schemas/includes/officer.json +14 -0
data/schemas/includes/previous_name.json +11 -0
data/schemas/includes/share-parcel-data.json +67 -0
data/schemas/includes/share-parcel.json +60 -0
data/schemas/includes/subsidiary-relationship-data.json +52 -0
data/schemas/includes/total-shares.json +10 -0
data/schemas/licence-schema.json +21 -0
data/schemas/share-parcel-schema.json +21 -0
data/schemas/subsidiary-relationship-schema.json +19 -0
data/spec/dummy_classes/foo_bot.rb +4 -0
data/spec/lib/bot_data_validator_spec.rb +69 -0
data/spec/lib/company_fetcher_bot_spec.rb +93 -0
data/spec/lib/exceptions_spec.rb +25 -0
data/spec/lib/helpers/alpha_search_spec.rb +173 -0
data/spec/lib/helpers/dates_spec.rb +65 -0
data/spec/lib/helpers/incremental_search_spec.rb +471 -0
data/spec/lib/helpers/register_methods_spec.rb +558 -0
data/spec/lib/helpers/text_spec.rb +50 -0
data/spec/lib/openc_bot/db/.gitkeep +0 -0
data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
data/spec/lib/openc_bot_spec.rb +116 -0
data/spec/schemas/company-schema_spec.rb +676 -0
data/spec/simple_openc_bot_spec.rb +302 -0
data/spec/spec_helper.rb +19 -0
metadata +300 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 2c2492325f145ede40b77b666ab93b99c47ba314
+  data.tar.gz: 7aa0dd5faf896d3a7e6a2217092ced8227084e8b
+SHA512:
+  metadata.gz: 9c2709f8c3cb91d06d6e356809e8adc2e16dd5499b5c85e217fe6637c2c4045b6e6add769ffded9844227ef16a04ea387f0d252a4d4ffeb80fd6cea5876f4faf
+  data.tar.gz: 94d5a2d6222a04164ee5f93cb266129790651f3aae587faa20c52af81d4fbdbb15d899b11eb42a2dd2f06aa075d80bf282e1a503d39b3400b07f9da016061271

data/.gitignore ADDED Viewed

@@ -0,0 +1,22 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+.DS_Store
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp
+*~
+db/*
+**/db/*
+!.gitkeep

data/.travis.yml ADDED Viewed

@@ -0,0 +1,8 @@
+language: ruby
+rvm:
+  - "1.9.2"
+  - "1.9.3"
+  - "2.1.0"
+  # - jruby-18mode # JRuby in 1.8 mode
+  # - jruby-19mode # JRuby in 1.9 mode
+  # - rbx

data/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,2 @@
+# 0.0.1
+* Initial commit

data/Gemfile ADDED Viewed

@@ -0,0 +1,8 @@
+source 'https://rubygems.org'
+gem "sqlite_magic", :git => 'https://github.com/openc/sqlite_magic.git'
+gem "pry", :group => [:development,:test]
+# Specify your gem's dependencies in openc_bot.gemspec
+gemspec
+# we need to do pull request and bump version
+# gem 'scraperwiki', '>=3.0.2', :git => 'https://github.com/openc/scraperwiki-ruby.git'

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,22 @@
+Copyright (c) 2013 Chris Taggart
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,253 @@
+# OpencBot
+## Overview
+This is a gem to allow bots to be written to fetch and format data
+that can be easily imported into OpenCorporates, the largest openly
+licensed database of companies in the world.
+To start writing a new bot, run the following to create a skeleton bot:
+```bash
+mkdir your_bot_name
+cd your_bot_name
+curl -s https://raw.githubusercontent.com/openc/openc_bot/master/create_simple_licence_bot.sh | bash
+```
+The default bot doesn't scrape, it just outputs some dummy data. You can try:
+* running the scrape with `bundle exec openc_bot rake bot:run`
+* testing the validity of the data it will output with
+  `bundle exec openc_bot rake bot:test`
+* viewing a sample of the data with `bundle exec openc_bot rake bot:spotcheck`
+Take a look at the bot code created at
+`your_bot_name/lib/your_bot_name.rb` and read the comments there to
+start writing your own bot.  Look at the example bots in the
+`examples/` folder for inspiration, including how to scrape from a
+website, and how to use "incrementers" to help with resumable,
+incremental scrapes (see below for more).
+These bots are all runnable; you should be able to `cd` to their
+directory, run `bundle install`, and then `bundle exec openc_bot rake
+bot:run`
+You can write bots for any schemas we have defined --
+see [SCHEMAS.md](./doc/SCHEMAS.md) for currently supported schemas.
+When you are happy that your bot is finished, please update its
+`README.md`, change the `enabled` flag in `config.yml` to be `true`,
+and email us.
+Please note that dates are a bit complicated, so we ask you to read
+the bit about dates below carefully.
+## About fetching and transforming data
+As you'll see in the sample bot, bots have separate steps to fetch
+data (the `fetch_all_records` method) and to transform it to a format
+suitable for OpenCorporates (the `to_pipeline` method).
+It is useful to have separate *fetch* and *export* phases for a couple
+of reasons:
+* For very large source datasets, it can take months to complete a
+  scrape. It is then useful to verify the data quality before
+  ingesting it in OpenCorporates.
+* Often, datasets may include a load of potentially interesting data
+  which OpenCorporates doesn't yet support.  It's worth storing this
+  data in an intermediate format, to save having to scrape it again in
+  the future. Please save anything like that and make a note of it in
+  your `README.md`.
+For more complicated scrapers, you may wish to do things more manually
+-- see [README-complex.md](./doc/README-complex.md) for more info.
+# A few words about dates
+There are three kinds of dates that OpenCorporates deals with:
+1. The date on which an observation was true: the `sample_date`. This
+is the date of a bot run, or a reporting date given in the source
+document. Every observation **must have a sample date**.
+2. A `start_date` and/or `end_date` defined explicitly in the source
+document
+3. A `start_date` or `end_date` that has not been provided by the
+source, but which OpenCorporates can infer from one or more sample
+dates. *In this case, you just supply a sample_date, and we do the
+rest*
+All dates should be in ISO8601 format.
+## A few more words about dates
+One of the important parts of the data format expected by
+OpenCorporates is the dates a statement is known to be true.
+All statements can be considered to be true between a start date and
+an end date. Sources that make explicit statements like this are great --
+but they're rare. For sources that don't explicitly define start and
+end dates for statements, it is down to OpenCorporates to compute
+these based on the bot's run schedule, and sample dates in the source
+data.
+Imagine you are interested in mining licenses in Lilliput and
+Brobdingnag, and you want to provide this data to OpenCorporates. You
+find a website that lists mining licenses for these jurisdictions, so
+you write a bot that can submit each license.
+You find that Lilliputian licenses have a defined start date and a
+defined end date, which mean you can explicitly say "this license is
+valid between 1 June 2012 and 31 Aug 2013" for a particular license.
+In this case, you would submit the data with a `start_date` of
+`2012-06-01` and an `end_date` of `2013-08-31`; and a
+`start_date_type` of `=` and an `end_date_type` of `=`. You would
+also submit a `sample_date` for that document, which is the date on
+which the license was known to be current (often today's date, but
+sometimes the reporting date given in the source).
+However, you find that Brobdingnagian licenses only tell you currently
+issued licenses. As a bot writer, all you can say of a particular
+license is "I saw this license when we ran the bot on 15 January
+2012". In this case, you would leave `start_date` and `end_date`
+blank, and submit a `sample_date` of `2012-01-15` instead.
+If you subsequently see the license on 15 February, you'd submit
+exactly the same data with a new `sample_date`.
+A bot is expected to be run periodically, at intervals relevant to its
+source. For example, a bot that scrapes data which changes monthly should
+scrape at least monthly. You should indicate this in the bot's
+`config.yml` file.
+This means OpenCorporates can infer, based on the running schedule of
+the bot, and the `sample_date`s of its data, the dates between which a
+license was valid (in this case, between 15 January and 15 February).
+Hence the above.
+# Speeding up your tests
+When writing scrapers, it's common to find yourself repeatedly
+scraping data from a source as you iteratively improve your code. It
+can be useful to use a caching proxy on your development machine to
+speed up this cycle.
+If you run `bundle exec openc_bot rake bot:run -- --test`, then your
+`fetch_records` method will receive an option `test_mode`; you can use
+this to turn proxying on or off.  Here's how you can set a proxy using
+the `mechanize` library; if you want to use different http client
+libraries, refer to their documentation regarding how to set a proxy.
+    agent = Mechanize.new
+    if opts[:test_mode]
+      # this requires you to have a working proxy set up -- see
+      # README.md for notes. It can speed up development considerably.
+      agent.set_proxy 'localhost', 8123
+    end
+    agent.get("http://www.foo.com") # will get it from local cache the second time
+To make this work, you will also want to set up a caching proxy
+listening on `localhost:8123`.  One such lightweight proxy is
+[polipo](http://www.pps.univ-paris-diderot.fr/~jch/software/polipo/),
+which is available packaged for various platforms.  The following
+options in the config work for us:
+    cacheIsShared = false
+    disableIndexing = false
+    disableServersList = false
+    relaxTransparency = yes
+    dontTrustVaryETag = yes
+    proxyOffline = no
+# Targeting specific records
+If you define an (optional) `fetch_specific_records` method in your
+bot, then you can specify particular records you wish to be
+fetched, thus:
+    bundle exec openc_bot rake bot:run -- --identifier "Foo Corp"
+You can also target specific records to export with:
+    bundle exec openc_bot rake bot:export -- --identifier "Foo Corp"
+# Incremental, resumable searches
+It's often necessary to do incremental searches or scrapes to get a
+full set of data. For example, you may know that all the records exist
+at urls like http://foo.com/?page=1, http://foo.com/?page=2, etc.
+Another common use case is where you can only access records with a
+search. In these cases, there's no alternative except to search for
+all the possible permutations of the letters A-Z and numbers 0-9 (in
+the case of ASCII-searchable databases).
+In the latter case, with three-character search terms, this is 46656 (36^3) different possible
+permutations. This will take a long time to scrape. If for some reason
+the scraper gets interrupted, you don't want to have to start again.
+We provide some convenience iterators, which save their current state,
+and resume from it unless told otherwise. They are probably not worth using for
+small scrapes (e.g. ones that take 10 mins) as they add to the complexity
+of your code; however, they are invaluable for large scrapes that may well
+get interrupted.
+    # currently provides a NumericIncrementer and an AsciiIncrementer:
+    require 'openc_bot/incrementers'
+    def fetch_all_records(opts={})
+        counter = NumericIncrementer.new(
+          :my_incrementer,
+          opts.merge(
+              :start_val => 0,
+              :end_val => 20))
+        # yield records one at a time, resuming by default
+        counter.resumable.each do |num|
+          url = "http://assets.opencorporates.com/test_bot_page_#{num}.html"
+          yield record_from_url(url)
+        end
+    end
+The above code would resume an incremental search automatically. To
+reset, run the bot thus:
+    bundle exec openc_bot rake bot:run -- --reset
+When debugging, it is useful to test out only a few iterations at a time. To do this:
+    bundle exec openc_bot rake bot:run -- --max-iterations=3
+This will restrict all iterators to a maximum of three iterations.
+There's also an incrementer which you can manually fill with records
+(arbitrary hashes), thus:
+    incrementer =  OpencBot::ManualIncrementer.new(
+        :my_incrementer,
+        opts.merge(:fields => [:num]))
+    (0..10).each do |num|
+        incrementer.add_row({'num' => num})
+    end
+    # now increment over its values, resuming where we left off last time if interrupted
+    incrementer.resumable.each do |item|
+      doc = agent.get("http://assets.opencorporates.com/document_number#{item["num"]}")
+    end
+ManualIncrementers also have a persisted field named `populated`,
+which you can use to skip expensive record-filling if it's already
+been done:
+    if !incrementer.populated
+        (0..10).each do |num|
+            incrementer.add_row({'num' => num})
+        end
+    end
+    incrementer.populated = true
+There are examples of how this can work in
+`examples/bot_with_simple_iterator`.

data/Rakefile ADDED Viewed

@@ -0,0 +1,14 @@
+require "bundler/gem_tasks"
+# load 'lib/tasks/openc_bot.rake'
+# require 'lib/tasks'
+require 'openc_bot/tasks'
+$LOAD_PATH.unshift File.dirname(__FILE__) + '/../../lib'
+# require 'resque/tasks'
+Dir.glob('lib/tasks/*.rake').each { |r| import r }
+require 'rspec/core/rake_task'
+task :default => :spec
+RSpec::Core::RakeTask.new

data/bin/openc_bot ADDED Viewed

@@ -0,0 +1,13 @@
+gem_dir = File.expand_path("..",File.dirname(__FILE__))
+$LOAD_PATH.unshift gem_dir# Look in gem directory for resources first.
+exec_type = ARGV[0]
+if exec_type == 'rake' then
+  require 'rake'
+  require 'pp'
+  pwd=Dir.pwd
+  Dir.chdir(gem_dir) # We'll load rakefile from the gem's dir.
+  Rake.application.init
+  Rake.application.load_rakefile
+  Dir.chdir(pwd) # Revert to original pwd for any path args passed to task.
+  Rake.application.invoke_task(ARGV[1])
+end

data/create_bot.sh ADDED Viewed

@@ -0,0 +1,30 @@
+#!/bin/bash
+set -e
+# Add the openc_bot to the Gemfile:
+if [ ! -f Gemfile ]; then
+  echo "source 'https://rubygems.org'" >> Gemfile
+  echo "gem 'openc_bot', :git => 'https://github.com/openc/openc_bot.git'" >> Gemfile
+fi
+echo "/db/*" >> .gitignore
+echo "/data/*" >> .gitignore
+echo "/tmp/*" >> .gitignore
+echo "/pids/*" >> .gitignore
+echo "!.gitkeep" >> .gitignore
+mkdir -p db
+mkdir -p data
+mkdir -p tmp
+mkdir -p pids
+touch db/.gitkeep
+touch data/.gitkeep
+touch tmp/.gitkeep
+touch pids/.gitkeep
+bundle install
+# create the bot
+bundle exec openc_bot rake bot:create
+bundle install

data/create_company_bot.sh ADDED Viewed

@@ -0,0 +1,16 @@
+#!/bin/bash
+set -e
+# Add the openc_bot to the Gemfile:
+if [ ! -f Gemfile ]; then
+  echo "source 'https://rubygems.org'" >> Gemfile
+  echo "gem 'openc_bot', :git => 'https://github.com/openc/openc_bot.git', :branch => 'company_fetcher_bot'" >> Gemfile
+fi
+echo "/db" >> .gitignore
+echo "/data" >> .gitignore
+echo "/tmp" >> .gitignore
+bundle install
+# create the bot
+bundle exec openc_bot rake bot:create_company_bot
+bundle install

data/create_simple_licence_bot.sh ADDED Viewed

@@ -0,0 +1,31 @@
+#!/bin/bash
+set -e
+# Add the openc_bot to the Gemfile:
+if [ ! -f Gemfile ]; then
+  echo "source 'https://rubygems.org'" >> Gemfile
+  echo "gem 'openc_bot', :git => 'https://github.com/openc/openc_bot.git'" >> Gemfile
+  echo "gem 'mechanize'" >> Gemfile
+fi
+echo "/db/*" >> .gitignore
+echo "/data/*" >> .gitignore
+echo "/tmp/*" >> .gitignore
+echo "/pids/*" >> .gitignore
+echo "!.gitkeep" >> .gitignore
+mkdir -p db
+mkdir -p data
+mkdir -p tmp
+mkdir -p pids
+touch db/.gitkeep
+touch data/.gitkeep
+touch tmp/.gitkeep
+touch pids/.gitkeep
+bundle install
+# create the bot
+bundle exec openc_bot rake bot:create_simple_bot
+bundle install