openc_bot 0.0.11

Sign up to get free protection for your applications and to get access to all the features.
Files changed (85) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/.travis.yml +8 -0
  4. data/CHANGELOG.md +2 -0
  5. data/Gemfile +8 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +253 -0
  8. data/Rakefile +14 -0
  9. data/bin/openc_bot +13 -0
  10. data/create_bot.sh +30 -0
  11. data/create_company_bot.sh +16 -0
  12. data/create_simple_licence_bot.sh +31 -0
  13. data/db/.gitkeep +0 -0
  14. data/examples/basic/.gitignore +3 -0
  15. data/examples/basic/Gemfile +7 -0
  16. data/examples/basic/config.yml +21 -0
  17. data/examples/basic/lib/basic.rb +88 -0
  18. data/examples/basic_with_proxy/Gemfile +7 -0
  19. data/examples/basic_with_proxy/config.yml +21 -0
  20. data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
  21. data/examples/bot_with_simple_iterator/Gemfile +6 -0
  22. data/examples/bot_with_simple_iterator/config.yml +21 -0
  23. data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
  24. data/examples/company_fetchers/basic.rb +49 -0
  25. data/lib/monkey_patches/mechanize.rb +53 -0
  26. data/lib/openc_bot.rb +89 -0
  27. data/lib/openc_bot/bot_data_validator.rb +18 -0
  28. data/lib/openc_bot/company_fetcher_bot.rb +40 -0
  29. data/lib/openc_bot/exceptions.rb +17 -0
  30. data/lib/openc_bot/helpers/_csv.rb +10 -0
  31. data/lib/openc_bot/helpers/alpha_search.rb +73 -0
  32. data/lib/openc_bot/helpers/dates.rb +33 -0
  33. data/lib/openc_bot/helpers/html.rb +8 -0
  34. data/lib/openc_bot/helpers/incremental_search.rb +106 -0
  35. data/lib/openc_bot/helpers/register_methods.rb +205 -0
  36. data/lib/openc_bot/helpers/text.rb +18 -0
  37. data/lib/openc_bot/incrementers.rb +2 -0
  38. data/lib/openc_bot/incrementers/base.rb +214 -0
  39. data/lib/openc_bot/incrementers/common.rb +47 -0
  40. data/lib/openc_bot/tasks.rb +385 -0
  41. data/lib/openc_bot/templates/README.md +35 -0
  42. data/lib/openc_bot/templates/bin/export_data +28 -0
  43. data/lib/openc_bot/templates/bin/fetch_data +23 -0
  44. data/lib/openc_bot/templates/bin/verify_data +1 -0
  45. data/lib/openc_bot/templates/config.yml +21 -0
  46. data/lib/openc_bot/templates/lib/bot.rb +43 -0
  47. data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
  48. data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
  49. data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
  50. data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
  51. data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
  52. data/lib/openc_bot/version.rb +3 -0
  53. data/lib/simple_openc_bot.rb +289 -0
  54. data/openc_bot.gemspec +35 -0
  55. data/schemas/company-schema.json +112 -0
  56. data/schemas/includes/address.json +23 -0
  57. data/schemas/includes/base-statement.json +27 -0
  58. data/schemas/includes/company.json +14 -0
  59. data/schemas/includes/filing.json +20 -0
  60. data/schemas/includes/license-data.json +27 -0
  61. data/schemas/includes/officer.json +14 -0
  62. data/schemas/includes/previous_name.json +11 -0
  63. data/schemas/includes/share-parcel-data.json +67 -0
  64. data/schemas/includes/share-parcel.json +60 -0
  65. data/schemas/includes/subsidiary-relationship-data.json +52 -0
  66. data/schemas/includes/total-shares.json +10 -0
  67. data/schemas/licence-schema.json +21 -0
  68. data/schemas/share-parcel-schema.json +21 -0
  69. data/schemas/subsidiary-relationship-schema.json +19 -0
  70. data/spec/dummy_classes/foo_bot.rb +4 -0
  71. data/spec/lib/bot_data_validator_spec.rb +69 -0
  72. data/spec/lib/company_fetcher_bot_spec.rb +93 -0
  73. data/spec/lib/exceptions_spec.rb +25 -0
  74. data/spec/lib/helpers/alpha_search_spec.rb +173 -0
  75. data/spec/lib/helpers/dates_spec.rb +65 -0
  76. data/spec/lib/helpers/incremental_search_spec.rb +471 -0
  77. data/spec/lib/helpers/register_methods_spec.rb +558 -0
  78. data/spec/lib/helpers/text_spec.rb +50 -0
  79. data/spec/lib/openc_bot/db/.gitkeep +0 -0
  80. data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
  81. data/spec/lib/openc_bot_spec.rb +116 -0
  82. data/spec/schemas/company-schema_spec.rb +676 -0
  83. data/spec/simple_openc_bot_spec.rb +302 -0
  84. data/spec/spec_helper.rb +19 -0
  85. metadata +300 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 2c2492325f145ede40b77b666ab93b99c47ba314
4
+ data.tar.gz: 7aa0dd5faf896d3a7e6a2217092ced8227084e8b
5
+ SHA512:
6
+ metadata.gz: 9c2709f8c3cb91d06d6e356809e8adc2e16dd5499b5c85e217fe6637c2c4045b6e6add769ffded9844227ef16a04ea387f0d252a4d4ffeb80fd6cea5876f4faf
7
+ data.tar.gz: 94d5a2d6222a04164ee5f93cb266129790651f3aae587faa20c52af81d4fbdbb15d899b11eb42a2dd2f06aa075d80bf282e1a503d39b3400b07f9da016061271
data/.gitignore ADDED
@@ -0,0 +1,22 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ .DS_Store
7
+ Gemfile.lock
8
+ InstalledFiles
9
+ _yardoc
10
+ coverage
11
+ doc/
12
+ lib/bundler/man
13
+ pkg
14
+ rdoc
15
+ spec/reports
16
+ test/tmp
17
+ test/version_tmp
18
+ tmp
19
+ *~
20
+ db/*
21
+ **/db/*
22
+ !.gitkeep
data/.travis.yml ADDED
@@ -0,0 +1,8 @@
1
+ language: ruby
2
+ rvm:
3
+ - "1.9.2"
4
+ - "1.9.3"
5
+ - "2.1.0"
6
+ # - jruby-18mode # JRuby in 1.8 mode
7
+ # - jruby-19mode # JRuby in 1.9 mode
8
+ # - rbx
data/CHANGELOG.md ADDED
@@ -0,0 +1,2 @@
1
+ # 0.0.1
2
+ * Initial commit
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ source 'https://rubygems.org'
2
+ gem "sqlite_magic", :git => 'https://github.com/openc/sqlite_magic.git'
3
+ gem "pry", :group => [:development,:test]
4
+ # Specify your gem's dependencies in openc_bot.gemspec
5
+ gemspec
6
+
7
+ # we need to do pull request and bump version
8
+ # gem 'scraperwiki', '>=3.0.2', :git => 'https://github.com/openc/scraperwiki-ruby.git'
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Chris Taggart
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,253 @@
1
+ # OpencBot
2
+
3
+ ## Overview
4
+
5
+ This is a gem to allow bots to be written to fetch and format data
6
+ that can be easily imported into OpenCorporates, the largest openly
7
+ licensed database of companies in the world.
8
+
9
+ To start writing a new bot, run the following to create a skeleton bot:
10
+
11
+ ```bash
12
+ mkdir your_bot_name
13
+ cd your_bot_name
14
+ curl -s https://raw.githubusercontent.com/openc/openc_bot/master/create_simple_licence_bot.sh | bash
15
+ ```
16
+
17
+ The default bot doesn't scrape, it just outputs some dummy data. You can try:
18
+
19
+ * running the scrape with `bundle exec openc_bot rake bot:run`
20
+ * testing the validity of the data it will output with
21
+ `bundle exec openc_bot rake bot:test`
22
+ * viewing a sample of the data with `bundle exec openc_bot rake bot:spotcheck`
23
+
24
+ Take a look at the bot code created at
25
+ `your_bot_name/lib/your_bot_name.rb` and read the comments there to
26
+ start writing your own bot. Look at the example bots in the
27
+ `examples/` folder for inspiration, including how to scrape from a
28
+ website, and how to use "incrementers" to help with resumable,
29
+ incremental scrapes (see below for more).
30
+
31
+ These bots are all runnable; you should be able to `cd` to their
32
+ directory, run `bundle install`, and then `bundle exec openc_bot rake
33
+ bot:run`
34
+
35
+ You can write bots for any schemas we have defined
36
+ - see [SCHEMAS.md](./doc/SCHEMAS.md) for currently supported schemas.
37
+
38
+ When you are happy that your bot is finished, please update its
39
+ `README.md`, change the `enabled` flag in `config.yml` to be `true`,
40
+ and email us.
41
+
42
+ Please note that dates are a bit complicated, so we ask you to read
43
+ the bit about dates below carefully.
44
+
45
+ ## About fetching and transforming data
46
+
47
+ As you'll see in the sample bot, bots have separate steps to fetch
48
+ data (the `fetch_all_records` method) and to transform it to a format
49
+ suitable for OpenCorporates (the `to_pipeline` method).
50
+
51
+ It is useful to have separate *fetch* and *export* phases for a couple
52
+ of reasons:
53
+
54
+ * For very large source datasets, it can take months to complete a
55
+ scrape. It is then useful to verify the data quality before
56
+ ingesting it in OpenCorporates.
57
+ * Often, datasets may include a load of potentially interesting data
58
+ which OpenCorporates doesn't yet support. It's worth storing this
59
+ data in an intermediate format, to save having to scrape it again in
60
+ the future. Please save anything like that and make a note of it in
61
+ your `README.md`.
62
+
63
+ For more complicated scrapers, you may wish to do things more manually
64
+ -- see [README-complex.md](./doc/README-complex.md) for more info.
65
+
66
+ # A few words about dates
67
+
68
+ There are three kinds of dates that OpenCorporates deals with:
69
+
70
+ 1. The date on which an observation was true: the `sample_date`. This
71
+ is the date of a bot run, or a reporting date given in the source
72
+ document. Every observation **must have a sample date**.
73
+ 2. A `start_date` and/or `end_date` defined explicitly in the source
74
+ document
75
+ 3. A `start_date` or `end_date` that has not been provided by the
76
+ source, but which OpenCorporates can infer from one or more sample
77
+ dates. *In this case, you just supply a sample_date, and we do the
78
+ rest*
79
+
80
+ All dates should be in ISO8601 format.
81
+
82
+ ## A few more words about dates
83
+
84
+ One of the important parts of the data format expected by
85
+ OpenCorporates is the set of dates on which a statement is known to be true.
86
+
87
+ All statements can be considered to be true between a start date and
88
+ an end date. Sources that make explicit statements like this are great
89
+ - but they're rare. For sources that don't explicitly define start and
90
+ end dates for statements, it is down to OpenCorporates to compute
91
+ these based on the bot's run schedule, and sample dates in the source
92
+ data.
93
+
94
+ Imagine you are interested in mining licenses in Liliput and
95
+ Brobdingnag, and you want to provide this data to OpenCorporates. You
96
+ find a website that lists mining licenses for these jurisdictions, so
97
+ you write a bot that can submit each license.
98
+
99
+ You find that Liliputian licenses have a defined start date and a
100
+ defined end date, which means you can explicitly say "this license is
101
+ valid between 1 June 2012 and 31 Aug 2013" for a particular license.
102
+
103
+ In this case, you would submit the data with a `start_date` of
104
+ `2012-06-01` and an `end_date` of `2013-08-31`; and a
105
+ `start_date_type` of `=` and an `end_date_type` of `=`. You would
106
+ also submit a `sample_date` for that document, which is the date on
107
+ which the license was known to be current (often today's date, but
108
+ sometimes the reporting date given in the source).
109
+
110
+ However, you find that Brobdingnagian licenses only tell you currently
111
+ issued licenses. As a bot writer, all you can say of a particular
112
+ license is "I saw this license when we ran the bot on 15 January
113
+ 2012". In this case, you would leave `start_date` and `end_date`
114
+ blank, and submit a `sample_date` of `2012-01-15` instead.
115
+
116
+ If you subsequently see the license on 15 February, you'd submit
117
+ exactly the same data with a new `sample_date`.
118
+
119
+ A bot is expected to be run periodically, at intervals relevant to its
120
+ source. For example, a bot that scrapes data which changes monthly should
121
+ scrape at least monthly. You should indicate this in the bot's
122
+ `config.yml` file.
123
+
124
+ This means OpenCorporates can infer, based on the running schedule of
125
+ the bot, and the `sample_date`s of its data, the dates between which a
126
+ license was valid (in this case, between 15 January and 15 February).
127
+
128
+ Hence the above.
129
+
130
+ # Speeding up your tests
131
+
132
+ When writing scrapers, it's common to find yourself repeatedly
133
+ scraping data from a source as you iteratively improve your code. It
134
+ can be useful to use a caching proxy on your development machine to
135
+ speed up this cycle.
136
+
137
+ If you run `bundle exec openc_bot rake bot:run -- --test`, then your
138
+ `fetch_all_records` method will receive an option `test_mode`; you can use
139
+ this to turn proxying on or off. Here's how you can set a proxy using
140
+ the `mechanize` library; if you want to use different http client
141
+ libraries, refer to their documentation regarding how to set a proxy.
142
+
143
+ agent = Mechanize.new
144
+ if opts[:test_mode]
145
+ # this requires you to have a working proxy set up -- see
146
+ # README.md for notes. It can speed up development considerably.
147
+ agent.set_proxy 'localhost', 8123
148
+ end
149
+ agent.get("http://www.foo.com") # will get it from local cache the second time
150
+
151
+ To make this work, you will also want to set up a caching proxy
152
+ listening on `localhost:8123`. One such lightweight proxy is
153
+ [polipo](http://www.pps.univ-paris-diderot.fr/~jch/software/polipo/),
154
+ which is available packaged for various platforms. The following
155
+ options in the config work for us:
156
+
157
+ cacheIsShared = false
158
+ disableIndexing = false
159
+ disableServersList = false
160
+ relaxTransparency = yes
161
+ dontTrustVaryETag = yes
162
+ proxyOffline = no
163
+
164
+ # Targeting specific records
165
+
166
+ If you define an (optional) `fetch_specific_records` method in your
167
+ bot, then you can specify particular records you wish to be
168
+ fetched, thus:
169
+
170
+ bundle exec openc_bot rake bot:run -- --identifier "Foo Corp"
171
+
172
+ You can also target specific records to export with:
173
+
174
+ bundle exec openc_bot rake bot:export -- --identifier "Foo Corp"
175
+
176
+ # Incremental, resumable searches
177
+
178
+ It's often necessary to do incremental searches or scrapes to get a
179
+ full set of data. For example, you may know that all the records exist
180
+ at urls like http://foo.com/?page=1, http://foo.com/?page=2, etc.
181
+
182
+ Another common use case is where you can only access records with a
183
+ search. In these cases, there's no alternative except to search for
184
+ all the possible permutations of the letters A-Z and numbers 0-9 (in
185
+ the case of ASCII-searchable databases).
186
+
187
+ In the latter case, this is 46656 different possible
188
+ permutations. This will take a long time to scrape. If for some reason
189
+ the scraper gets interrupted, you don't want to have to start again.
190
+
191
+ We provide some convenience iterators, which save their current state,
192
+ and restart unless told otherwise. They are probably not worth using for
193
+ small scrapes (e.g. ones that take 10 mins) as they add to the complexity
194
+ of your code; however, they are invaluable for large scrapes that may well
195
+ get interrupted.
196
+
197
+ # currently provides a NumericIncrementer and an AsciiIncrementer:
198
+ require 'openc_bot/incrementers'
199
+
200
+ def fetch_all_records(opts={})
201
+ counter = NumericIncrementer.new(
202
+ :my_incrementer,
203
+ opts.merge(
204
+ :start_val => 0,
205
+ :end_val => 20))
206
+
207
+ # yield records one at a time, resuming by default
208
+ counter.resumable.each do |num|
209
+ url = "http://assets.opencorporates.com/test_bot_page_#{num}.html"
210
+ yield record_from_url(url)
211
+ end
212
+ end
213
+
214
+ The above code would resume an incremental search automatically. To
215
+ reset, run the bot thus:
216
+
217
+ bundle exec openc_bot rake bot:run -- --reset
218
+
219
+ When debugging, it is useful to test out only a few iterations at a time. To do this:
220
+
221
+ bundle exec openc_bot rake bot:run -- --max-iterations=3
222
+
223
+ This will restrict all iterators to a maximum of three iterations.
224
+
225
+ There's also an incrementer which you can manually fill with records
226
+ (arbitrary hashes), thus:
227
+
228
+ incrementer = OpencBot::ManualIncrementer.new(
229
+ :my_incrementer,
230
+ opts.merge(:fields => [:num]))
231
+
232
+ (0..10).each do |num|
233
+ incrementer.add_row({'num' => num})
234
+ end
235
+
236
+ # now increment over its values, resuming where we left off last time if interrupted
237
+ incrementer.resumable.each do |item|
238
+ doc = agent.get("http://assets.opencorporates.com/document_number#{item["num"]}")
239
+ end
240
+
241
+ ManualIncrementers also have a persisted field named `populated`,
242
+ which you can use to skip expensive record-filling if it's already
243
+ been done:
244
+
245
+ if !incrementer.populated
246
+ (0..10).each do |num|
247
+ incrementer.add_row({'num' => num})
248
+ end
249
+ end
250
+ incrementer.populated = true
251
+
252
+ There are examples of how this can work in
253
+ `examples/bot_with_simple_iterator`.
data/Rakefile ADDED
@@ -0,0 +1,14 @@
1
+ require "bundler/gem_tasks"
2
+ # load 'lib/tasks/openc_bot.rake'
3
+ # require 'lib/tasks'
4
+ require 'openc_bot/tasks'
5
+
6
+
7
+ $LOAD_PATH.unshift File.dirname(__FILE__) + '/../../lib'
8
+ # require 'resque/tasks'
9
+
10
+ Dir.glob('lib/tasks/*.rake').each { |r| import r }
11
+
12
+ require 'rspec/core/rake_task'
13
+ task :default => :spec
14
+ RSpec::Core::RakeTask.new
data/bin/openc_bot ADDED
@@ -0,0 +1,13 @@
1
+ gem_dir = File.expand_path("..",File.dirname(__FILE__))
2
+ $LOAD_PATH.unshift gem_dir# Look in gem directory for resources first.
3
+ exec_type = ARGV[0]
4
+ if exec_type == 'rake' then
5
+ require 'rake'
6
+ require 'pp'
7
+ pwd=Dir.pwd
8
+ Dir.chdir(gem_dir) # We'll load rakefile from the gem's dir.
9
+ Rake.application.init
10
+ Rake.application.load_rakefile
11
+ Dir.chdir(pwd) # Revert to original pwd for any path args passed to task.
12
+ Rake.application.invoke_task(ARGV[1])
13
+ end
data/create_bot.sh ADDED
@@ -0,0 +1,30 @@
1
+ #!/bin/bash
2
+
3
+ set -e
4
+
5
+ # Add the openc_bot to the Gemfile:
6
+ if [ ! -f Gemfile ]; then
7
+ echo "source 'https://rubygems.org'" >> Gemfile
8
+ echo "gem 'openc_bot', :git => 'https://github.com/openc/openc_bot.git'" >> Gemfile
9
+ fi
10
+
11
+ echo "/db/*" >> .gitignore
12
+ echo "/data/*" >> .gitignore
13
+ echo "/tmp/*" >> .gitignore
14
+ echo "/pids/*" >> .gitignore
15
+ echo "!.gitkeep" >> .gitignore
16
+
17
+ mkdir -p db
18
+ mkdir -p data
19
+ mkdir -p tmp
20
+ mkdir -p pids
21
+
22
+ touch db/.gitkeep
23
+ touch data/.gitkeep
24
+ touch tmp/.gitkeep
25
+ touch pids/.gitkeep
26
+
27
+ bundle install
28
+ # create the bot
29
+ bundle exec openc_bot rake bot:create
30
+ bundle install
data/create_company_bot.sh ADDED
@@ -0,0 +1,16 @@
1
+ #!/bin/bash
2
+
3
+ set -e
4
+
5
+ # Add the openc_bot to the Gemfile:
6
+ if [ ! -f Gemfile ]; then
7
+ echo "source 'https://rubygems.org'" >> Gemfile
8
+ echo "gem 'openc_bot', :git => 'https://github.com/openc/openc_bot.git', :branch => 'company_fetcher_bot'" >> Gemfile
9
+ fi
10
+ echo "/db" >> .gitignore
11
+ echo "/data" >> .gitignore
12
+ echo "/tmp" >> .gitignore
13
+ bundle install
14
+ # create the bot
15
+ bundle exec openc_bot rake bot:create_company_bot
16
+ bundle install
data/create_simple_licence_bot.sh ADDED
@@ -0,0 +1,31 @@
1
+ #!/bin/bash
2
+
3
+ set -e
4
+
5
+ # Add the openc_bot to the Gemfile:
6
+ if [ ! -f Gemfile ]; then
7
+ echo "source 'https://rubygems.org'" >> Gemfile
8
+ echo "gem 'openc_bot', :git => 'https://github.com/openc/openc_bot.git'" >> Gemfile
9
+ echo "gem 'mechanize'" >> Gemfile
10
+ fi
11
+
12
+ echo "/db/*" >> .gitignore
13
+ echo "/data/*" >> .gitignore
14
+ echo "/tmp/*" >> .gitignore
15
+ echo "/pids/*" >> .gitignore
16
+ echo "!.gitkeep" >> .gitignore
17
+
18
+ mkdir -p db
19
+ mkdir -p data
20
+ mkdir -p tmp
21
+ mkdir -p pids
22
+
23
+ touch db/.gitkeep
24
+ touch data/.gitkeep
25
+ touch tmp/.gitkeep
26
+ touch pids/.gitkeep
27
+
28
+ bundle install
29
+ # create the bot
30
+ bundle exec openc_bot rake bot:create_simple_bot
31
+ bundle install