openc_bot 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/.travis.yml +8 -0
  4. data/CHANGELOG.md +2 -0
  5. data/Gemfile +8 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +253 -0
  8. data/Rakefile +14 -0
  9. data/bin/openc_bot +13 -0
  10. data/create_bot.sh +30 -0
  11. data/create_company_bot.sh +16 -0
  12. data/create_simple_licence_bot.sh +31 -0
  13. data/db/.gitkeep +0 -0
  14. data/examples/basic/.gitignore +3 -0
  15. data/examples/basic/Gemfile +7 -0
  16. data/examples/basic/config.yml +21 -0
  17. data/examples/basic/lib/basic.rb +88 -0
  18. data/examples/basic_with_proxy/Gemfile +7 -0
  19. data/examples/basic_with_proxy/config.yml +21 -0
  20. data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
  21. data/examples/bot_with_simple_iterator/Gemfile +6 -0
  22. data/examples/bot_with_simple_iterator/config.yml +21 -0
  23. data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
  24. data/examples/company_fetchers/basic.rb +49 -0
  25. data/lib/monkey_patches/mechanize.rb +53 -0
  26. data/lib/openc_bot.rb +89 -0
  27. data/lib/openc_bot/bot_data_validator.rb +18 -0
  28. data/lib/openc_bot/company_fetcher_bot.rb +40 -0
  29. data/lib/openc_bot/exceptions.rb +17 -0
  30. data/lib/openc_bot/helpers/_csv.rb +10 -0
  31. data/lib/openc_bot/helpers/alpha_search.rb +73 -0
  32. data/lib/openc_bot/helpers/dates.rb +33 -0
  33. data/lib/openc_bot/helpers/html.rb +8 -0
  34. data/lib/openc_bot/helpers/incremental_search.rb +106 -0
  35. data/lib/openc_bot/helpers/register_methods.rb +205 -0
  36. data/lib/openc_bot/helpers/text.rb +18 -0
  37. data/lib/openc_bot/incrementers.rb +2 -0
  38. data/lib/openc_bot/incrementers/base.rb +214 -0
  39. data/lib/openc_bot/incrementers/common.rb +47 -0
  40. data/lib/openc_bot/tasks.rb +385 -0
  41. data/lib/openc_bot/templates/README.md +35 -0
  42. data/lib/openc_bot/templates/bin/export_data +28 -0
  43. data/lib/openc_bot/templates/bin/fetch_data +23 -0
  44. data/lib/openc_bot/templates/bin/verify_data +1 -0
  45. data/lib/openc_bot/templates/config.yml +21 -0
  46. data/lib/openc_bot/templates/lib/bot.rb +43 -0
  47. data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
  48. data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
  49. data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
  50. data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
  51. data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
  52. data/lib/openc_bot/version.rb +3 -0
  53. data/lib/simple_openc_bot.rb +289 -0
  54. data/openc_bot.gemspec +35 -0
  55. data/schemas/company-schema.json +112 -0
  56. data/schemas/includes/address.json +23 -0
  57. data/schemas/includes/base-statement.json +27 -0
  58. data/schemas/includes/company.json +14 -0
  59. data/schemas/includes/filing.json +20 -0
  60. data/schemas/includes/license-data.json +27 -0
  61. data/schemas/includes/officer.json +14 -0
  62. data/schemas/includes/previous_name.json +11 -0
  63. data/schemas/includes/share-parcel-data.json +67 -0
  64. data/schemas/includes/share-parcel.json +60 -0
  65. data/schemas/includes/subsidiary-relationship-data.json +52 -0
  66. data/schemas/includes/total-shares.json +10 -0
  67. data/schemas/licence-schema.json +21 -0
  68. data/schemas/share-parcel-schema.json +21 -0
  69. data/schemas/subsidiary-relationship-schema.json +19 -0
  70. data/spec/dummy_classes/foo_bot.rb +4 -0
  71. data/spec/lib/bot_data_validator_spec.rb +69 -0
  72. data/spec/lib/company_fetcher_bot_spec.rb +93 -0
  73. data/spec/lib/exceptions_spec.rb +25 -0
  74. data/spec/lib/helpers/alpha_search_spec.rb +173 -0
  75. data/spec/lib/helpers/dates_spec.rb +65 -0
  76. data/spec/lib/helpers/incremental_search_spec.rb +471 -0
  77. data/spec/lib/helpers/register_methods_spec.rb +558 -0
  78. data/spec/lib/helpers/text_spec.rb +50 -0
  79. data/spec/lib/openc_bot/db/.gitkeep +0 -0
  80. data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
  81. data/spec/lib/openc_bot_spec.rb +116 -0
  82. data/spec/schemas/company-schema_spec.rb +676 -0
  83. data/spec/simple_openc_bot_spec.rb +302 -0
  84. data/spec/spec_helper.rb +19 -0
  85. metadata +300 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 2c2492325f145ede40b77b666ab93b99c47ba314
4
+ data.tar.gz: 7aa0dd5faf896d3a7e6a2217092ced8227084e8b
5
+ SHA512:
6
+ metadata.gz: 9c2709f8c3cb91d06d6e356809e8adc2e16dd5499b5c85e217fe6637c2c4045b6e6add769ffded9844227ef16a04ea387f0d252a4d4ffeb80fd6cea5876f4faf
7
+ data.tar.gz: 94d5a2d6222a04164ee5f93cb266129790651f3aae587faa20c52af81d4fbdbb15d899b11eb42a2dd2f06aa075d80bf282e1a503d39b3400b07f9da016061271
data/.gitignore ADDED
@@ -0,0 +1,22 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ .DS_Store
7
+ Gemfile.lock
8
+ InstalledFiles
9
+ _yardoc
10
+ coverage
11
+ doc/
12
+ lib/bundler/man
13
+ pkg
14
+ rdoc
15
+ spec/reports
16
+ test/tmp
17
+ test/version_tmp
18
+ tmp
19
+ *~
20
+ db/*
21
+ **/db/*
22
+ !.gitkeep
data/.travis.yml ADDED
@@ -0,0 +1,8 @@
1
+ language: ruby
2
+ rvm:
3
+ - "1.9.2"
4
+ - "1.9.3"
5
+ - "2.1.0"
6
+ # - jruby-18mode # JRuby in 1.8 mode
7
+ # - jruby-19mode # JRuby in 1.9 mode
8
+ # - rbx
data/CHANGELOG.md ADDED
@@ -0,0 +1,2 @@
1
+ # 0.0.1
2
+ * Initial commit
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ source 'https://rubygems.org'
2
+ gem "sqlite_magic", :git => 'https://github.com/openc/sqlite_magic.git'
3
+ gem "pry", :group => [:development,:test]
4
+ # Specify your gem's dependencies in openc_bot.gemspec
5
+ gemspec
6
+
7
+ # we need to do pull request and bump version
8
+ # gem 'scraperwiki', '>=3.0.2', :git => 'https://github.com/openc/scraperwiki-ruby.git'
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Chris Taggart
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,253 @@
1
+ # OpencBot
2
+
3
+ ## Overview
4
+
5
+ This is a gem to allow bots to be written to fetch and format data
6
+ that can be easily imported into OpenCorporates, the largest openly
7
+ licensed database of companies in the world.
8
+
9
+ To start writing a new bot, run the following to create a skeleton bot:
10
+
11
+ ```bash
12
+ mkdir your_bot_name
13
+ cd your_bot_name
14
+ curl -s https://raw.githubusercontent.com/openc/openc_bot/master/create_simple_licence_bot.sh | bash
15
+ ```
16
+
17
+ The default bot doesn't scrape, it just outputs some dummy data. You can try:
18
+
19
+ * running the scrape with `bundle exec openc_bot rake bot:run`
20
+ * testing the validity of the data it will output with
21
+ `bundle exec openc_bot rake bot:test`
22
+ * viewing a sample of the data with `bundle exec openc_bot rake bot:spotcheck`
23
+
24
+ Take a look at the bot code created at
25
+ `your_bot_name/lib/your_bot_name.rb` and read the comments there to
26
+ start writing your own bot. Look at the example bots in the
27
+ `examples/` folder for inspiration, including how to scrape from a
28
+ website, and how to use "incrementers" to help with resumable,
29
+ incremental scrapes (see below for more).
30
+
31
+ These bots are all runnable; you should be able to `cd` to their
32
+ directory, run `bundle install`, and then `bundle exec openc_bot rake
33
+ bot:run`
34
+
35
+ You can write bots for any schemas we have defined
36
+ - see [SCHEMAS.md](./doc/SCHEMAS.md) for currently supported schemas.
37
+
38
+ When you are happy that your bot is finished, please update its
39
+ `README.md`, change the `enabled` flag in `config.yml` to be `true`,
40
+ and email us.
41
+
42
+ Please note that dates are a bit complicated, so we ask you to read
43
+ the bit about dates below carefully.
44
+
45
+ ## About fetching and transforming data
46
+
47
+ As you'll see in the sample bot, bots have separate steps to fetch
48
+ data (the `fetch_all_records` method) and to transform it to a format
49
+ suitable for OpenCorporates (the `to_pipeline` method).
50
+
51
+ It is useful to have separate *fetch* and *export* phases for a couple
52
+ of reasons:
53
+
54
+ * For very large source datasets, it can take months to complete a
55
+ scrape. It is then useful to verify the data quality before
56
+ ingesting it in OpenCorporates.
57
+ * Often, datasets may include a load of potentially interesting data
58
+ which OpenCorporates doesn't yet support. It's worth storing this
59
+ data in an intermediate format, to save having to scrape it again in
60
+ the future. Please save anything like that and make a note of it in
61
+ your `README.md`.
62
+
63
+ For more complicated scrapers, you may wish to do things more manually
64
+ -- see [README-complex.md](./doc/README-complex.md) for more info.
65
+
66
+ # A few words about dates
67
+
68
+ There are three kinds of dates that OpenCorporates deals with:
69
+
70
+ 1. The date on which an observation was true: the `sample_date`. This
71
+ is the date of a bot run, or a reporting date given in the source
72
+ document. Every observation **must have a sample date**.
73
+ 2. A `start_date` and/or `end_date` defined explicitly in the source
74
+ document
75
+ 3. A `start_date` or `end_date` that has not been provided by the
76
+ source, but which OpenCorporates can infer from one or more sample
77
+ dates. *In this case, you just supply a sample_date, and we do the
78
+ rest*
79
+
80
+ All dates should be in ISO8601 format.
81
+
82
+ ## A few more words about dates
83
+
84
+ One of the important parts of the data format expected by
85
+ OpenCorporates is the set of dates on which a statement is known to be true.
86
+
87
+ All statements can be considered to be true between a start date and
88
+ an end date. Sources that make explicit statements like this are great
89
+ - but they're rare. For sources that don't explicitly define start and
90
+ end dates for statements, it is down to OpenCorporates to compute
91
+ these based on the bot's run schedule, and sample dates in the source
92
+ data.
93
+
94
+ Imagine you are interested in mining licenses in Lilliput and
95
+ Brobdingnag, and you want to provide this data to OpenCorporates. You
96
+ find a website that lists mining licenses for these jurisdictions, so
97
+ you write a bot that can submit each license.
98
+
99
+ You find that Lilliputian licenses have a defined start date and a
100
+ defined end date, which means you can explicitly say "this license is
101
+ valid between 1 June 2012 and 31 Aug 2013" for a particular license.
102
+
103
+ In this case, you would submit the data with a `start_date` of
104
+ `2012-06-01` and an `end_date` of `2013-08-31`; and a
105
+ `start_date_type` of `=` and an `end_date_type` of `=`. You would
106
+ also submit a `sample_date` for that document, which is the date on
107
+ which the license was known to be current (often today's date, but
108
+ sometimes the reporting date given in the source).
109
+
110
+ However, you find that Brobdingnagian licenses only tell you currently
111
+ issued licenses. As a bot writer, all you can say of a particular
112
+ license is "I saw this license when we ran the bot on 15 January
113
+ 2012". In this case, you would leave `start_date` and `end_date`
114
+ blank, and submit a `sample_date` of `2012-01-15` instead.
115
+
116
+ If you subsequently see the license on 15 February, you'd submit
117
+ exactly the same data with a new `sample_date`.
118
+
119
+ A bot is expected to be run periodically, at intervals relevant to its
120
+ source. For example, a bot that scrapes data which changes monthly should
121
+ scrape at least monthly. You should indicate this in the bot's
122
+ `config.yml` file.
123
+
124
+ This means OpenCorporates can infer, based on the running schedule of
125
+ the bot, and the `sample_date`s of its data, the dates between which a
126
+ license was valid (in this case, between 15 January and 15 February).
127
+
128
+ Hence the above.
129
+
130
+ # Speeding up your tests
131
+
132
+ When writing scrapers, it's common to find yourself repeatedly
133
+ scraping data from a source as you iteratively improve your code. It
134
+ can be useful to use a caching proxy on your development machine to
135
+ speed up this cycle.
136
+
137
+ If you run `bundle exec openc_bot rake bot:run -- --test`, then your
138
+ `fetch_all_records` method will receive an option `test_mode`; you can use
139
+ this to turn proxying on or off. Here's how you can set a proxy using
140
+ the `mechanize` library; if you want to use different http client
141
+ libraries, refer to their documentation regarding how to set a proxy.
142
+
143
+ agent = Mechanize.new
144
+ if opts[:test_mode]
145
+ # this requires you to have a working proxy set up -- see
146
+ # README.md for notes. It can speed up development considerably.
147
+ agent.set_proxy 'localhost', 8123
148
+ end
149
+ agent.get("http://www.foo.com") # will get it from local cache the second time
150
+
151
+ To make this work, you will also want to set up a caching proxy
152
+ listening on `localhost:8123`. One such lightweight proxy is
153
+ [polipo](http://www.pps.univ-paris-diderot.fr/~jch/software/polipo/),
154
+ which is available packaged for various platforms. The following
155
+ options in the config work for us:
156
+
157
+ cacheIsShared = false
158
+ disableIndexing = false
159
+ disableServersList = false
160
+ relaxTransparency = yes
161
+ dontTrustVaryETag = yes
162
+ proxyOffline = no
163
+
164
+ # Targeting specific records
165
+
166
+ If you define an (optional) `fetch_specific_records` method in your
167
+ bot, then you can specify particular records you wish to be
168
+ fetched, thus:
169
+
170
+ bundle exec openc_bot rake bot:run -- --identifier "Foo Corp"
171
+
172
+ You can also target specific records to export with:
173
+
174
+ bundle exec openc_bot rake bot:export -- --identifier "Foo Corp"
175
+
176
+ # Incremental, resumable searches
177
+
178
+ It's often necessary to do incremental searches or scrapes to get a
179
+ full set of data. For example, you may know that all the records exist
180
+ at urls like http://foo.com/?page=1, http://foo.com/?page=2, etc.
181
+
182
+ Another common use case is where you can only access records with a
183
+ search. In these cases, there's no alternative except to search for
184
+ all the possible permutations of the letters A-Z and numbers 0-9 (in
185
+ the case of ASCII-searchable databases).
186
+
187
+ In the latter case, with three-character search terms, this is 46656 (36^3) different possible
188
+ permutations. This will take a long time to scrape. If for some reason
189
+ the scraper gets interrupted, you don't want to have to start again.
190
+
191
+ We provide some convenience iterators, which save their current state,
192
+ and restart unless told otherwise. They are probably not worth using for
193
+ small scrapes (e.g. ones that take 10 mins) as they add to the complexity
194
+ of your code; however, they are invaluable for large scrapes that may well
195
+ get interrupted.
196
+
197
+ # currently provides a NumericIncrementer and an AsciiIncrementer:
198
+ require 'openc_bot/incrementers'
199
+
200
+ def fetch_all_records(opts={})
201
+ counter = NumericIncrementer.new(
202
+ :my_incrementer,
203
+ opts.merge(
204
+ :start_val => 0,
205
+ :end_val => 20))
206
+
207
+ # yield records one at a time, resuming by default
208
+ counter.resumable.each do |num|
209
+ url = "http://assets.opencorporates.com/test_bot_page_#{num}.html"
210
+ yield record_from_url(url)
211
+ end
212
+ end
213
+
214
+ The above code would resume an incremental search automatically. To
215
+ reset, run the bot thus:
216
+
217
+ bundle exec openc_bot rake bot:run -- --reset
218
+
219
+ When debugging, it is useful to test out only a few iterations at a time. To do this:
220
+
221
+ bundle exec openc_bot rake bot:run -- --max-iterations=3
222
+
223
+ This will restrict all iterators to a maximum of three iterations.
224
+
225
+ There's also an incrementer which you can manually fill with records
226
+ (arbitrary hashes), thus:
227
+
228
+ incrementer = OpencBot::ManualIncrementer.new(
229
+ :my_incrementer,
230
+ opts.merge(:fields => [:num]))
231
+
232
+ (0..10).each do |num|
233
+ incrementer.add_row({'num' => num})
234
+ end
235
+
236
+ # now increment over its values, resuming where we left off last time if interrupted
237
+ incrementer.resumable.each do |item|
238
+ doc = agent.get("http://assets.opencorporates.com/document_number#{item["num"]}")
239
+ end
240
+
241
+ ManualIncrementers also have a persisted field named `populated`,
242
+ which you can use to skip expensive record-filling if it's already
243
+ been done:
244
+
245
+ if !incrementer.populated
246
+ (0..10).each do |num|
247
+ incrementer.add_row({'num' => num})
248
+ end
249
+ end
250
+ incrementer.populated = true
251
+
252
+ There are examples of how this can work in
253
+ `examples/bot_with_simple_iterator`.
data/Rakefile ADDED
@@ -0,0 +1,14 @@
1
+ require "bundler/gem_tasks"
2
+ # load 'lib/tasks/openc_bot.rake'
3
+ # require 'lib/tasks'
4
+ require 'openc_bot/tasks'
5
+
6
+
7
+ $LOAD_PATH.unshift File.dirname(__FILE__) + '/../../lib'
8
+ # require 'resque/tasks'
9
+
10
+ Dir.glob('lib/tasks/*.rake').each { |r| import r }
11
+
12
+ require 'rspec/core/rake_task'
13
+ task :default => :spec
14
+ RSpec::Core::RakeTask.new
data/bin/openc_bot ADDED
@@ -0,0 +1,13 @@
1
+ gem_dir = File.expand_path("..",File.dirname(__FILE__))
2
+ $LOAD_PATH.unshift gem_dir# Look in gem directory for resources first.
3
+ exec_type = ARGV[0]
4
+ if exec_type == 'rake' then
5
+ require 'rake'
6
+ require 'pp'
7
+ pwd=Dir.pwd
8
+ Dir.chdir(gem_dir) # We'll load rakefile from the gem's dir.
9
+ Rake.application.init
10
+ Rake.application.load_rakefile
11
+ Dir.chdir(pwd) # Revert to original pwd for any path args passed to task.
12
+ Rake.application.invoke_task(ARGV[1])
13
+ end
data/create_bot.sh ADDED
@@ -0,0 +1,30 @@
1
+ #!/bin/bash
2
+
3
+ set -e
4
+
5
+ # Add the openc_bot to the Gemfile:
6
+ if [ ! -f Gemfile ]; then
7
+ echo "source 'https://rubygems.org'" >> Gemfile
8
+ echo "gem 'openc_bot', :git => 'https://github.com/openc/openc_bot.git'" >> Gemfile
9
+ fi
10
+
11
+ echo "/db/*" >> .gitignore
12
+ echo "/data/*" >> .gitignore
13
+ echo "/tmp/*" >> .gitignore
14
+ echo "/pids/*" >> .gitignore
15
+ echo "!.gitkeep" >> .gitignore
16
+
17
+ mkdir -p db
18
+ mkdir -p data
19
+ mkdir -p tmp
20
+ mkdir -p pids
21
+
22
+ touch db/.gitkeep
23
+ touch data/.gitkeep
24
+ touch tmp/.gitkeep
25
+ touch pids/.gitkeep
26
+
27
+ bundle install
28
+ # create the bot
29
+ bundle exec openc_bot rake bot:create
30
+ bundle install
data/create_company_bot.sh ADDED
@@ -0,0 +1,16 @@
1
+ #!/bin/bash
2
+
3
+ set -e
4
+
5
+ # Add the openc_bot to the Gemfile:
6
+ if [ ! -f Gemfile ]; then
7
+ echo "source 'https://rubygems.org'" >> Gemfile
8
+ echo "gem 'openc_bot', :git => 'https://github.com/openc/openc_bot.git', :branch => 'company_fetcher_bot'" >> Gemfile
9
+ fi
10
+ echo "/db" >> .gitignore
11
+ echo "/data" >> .gitignore
12
+ echo "/tmp" >> .gitignore
13
+ bundle install
14
+ # create the bot
15
+ bundle exec openc_bot rake bot:create_company_bot
16
+ bundle install
data/create_simple_licence_bot.sh ADDED
@@ -0,0 +1,31 @@
1
+ #!/bin/bash
2
+
3
+ set -e
4
+
5
+ # Add the openc_bot to the Gemfile:
6
+ if [ ! -f Gemfile ]; then
7
+ echo "source 'https://rubygems.org'" >> Gemfile
8
+ echo "gem 'openc_bot', :git => 'https://github.com/openc/openc_bot.git'" >> Gemfile
9
+ echo "gem 'mechanize'" >> Gemfile
10
+ fi
11
+
12
+ echo "/db/*" >> .gitignore
13
+ echo "/data/*" >> .gitignore
14
+ echo "/tmp/*" >> .gitignore
15
+ echo "/pids/*" >> .gitignore
16
+ echo "!.gitkeep" >> .gitignore
17
+
18
+ mkdir -p db
19
+ mkdir -p data
20
+ mkdir -p tmp
21
+ mkdir -p pids
22
+
23
+ touch db/.gitkeep
24
+ touch data/.gitkeep
25
+ touch tmp/.gitkeep
26
+ touch pids/.gitkeep
27
+
28
+ bundle install
29
+ # create the bot
30
+ bundle exec openc_bot rake bot:create_simple_bot
31
+ bundle install