bankjob 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,114 @@
1
+
2
+ require 'rubygems'
3
+ require 'builder'
4
+ require 'digest/md5'
5
+
6
module Bankjob

  ##
  # A Payee object represents an entity in a bank Transaction that receives
  # a payment.
  #
  # A Scraper will create Payees while scraping web pages in an online
  # banking site. In many cases Payees will not be distinguished in the
  # online bank site, in which case rules will have to be applied to
  # separate the Payees.
  #
  # A Payee object knows how to write itself as a record in a CSV
  # (Comma Separated Values) file using +to_csv+ or as an XML element in an
  # OFX (Open Financial eXchange http://www.ofx.net) file using +to_ofx+.
  #
  class Payee

    # name of the payee
    # Translates to OFX element NAME
    attr_accessor :name

    # address of the payee
    # Translates to OFX element ADDR1
    #-- TODO Consider ADDR2,3
    attr_accessor :address

    # city in which the payee is located
    # Translates to OFX element CITY
    attr_accessor :city

    # state in which the payee is located
    # Translates to OFX element STATE
    attr_accessor :state

    # post code or zip in which the payee is located
    # Translates to OFX element POSTALCODE
    attr_accessor :postalcode

    # country in which the payee is located
    # Translates to OFX element COUNTRY
    attr_accessor :country

    # phone number of the payee
    # Translates to OFX element PHONE
    attr_accessor :phone

    ##
    # Generates a string representing this Payee as a single string for use
    # in a comma separated values column. Currently this is just the name.
    #
    def to_csv
      name
    end

    ##
    # Generates an XML string adhering to the OFX standard
    # (see Open Financial Exchange http://www.ofx.net)
    # representing a single PAYEE XML element.
    #
    # The schema for the OFX produced is
    #
    #   <xsd:complexType name="Payee">
    #     <xsd:annotation>
    #       <xsd:documentation>
    #         The OFX element "PAYEE" is of type "Payee"
    #       </xsd:documentation>
    #     </xsd:annotation>
    #     <xsd:sequence>
    #       <xsd:element name="NAME" type="ofx:GenericNameType"/>
    #       <xsd:sequence>
    #         <xsd:element name="ADDR1" type="ofx:AddressType"/>
    #         <xsd:sequence minOccurs="0">
    #           <xsd:element name="ADDR2" type="ofx:AddressType"/>
    #           <xsd:element name="ADDR3" type="ofx:AddressType" minOccurs="0"/>
    #         </xsd:sequence>
    #       </xsd:sequence>
    #       <xsd:element name="CITY" type="ofx:AddressType"/>
    #       <xsd:element name="STATE" type="ofx:StateType"/>
    #       <xsd:element name="POSTALCODE" type="ofx:ZipType"/>
    #       <xsd:element name="COUNTRY" type="ofx:CountryType" minOccurs="0"/>
    #       <xsd:element name="PHONE" type="ofx:PhoneType"/>
    #     </xsd:sequence>
    #   </xsd:complexType>
    #
    def to_ofx
      buf = ""
      # Set margin=6 to indent it nicely within the output from Transaction.to_ofx
      x = Builder::XmlMarkup.new(:target => buf, :indent => 2, :margin => 6)
      x.PAYEE {
        x.NAME name
        x.ADDR1 address
        x.CITY city
        x.STATE state
        x.POSTALCODE postalcode
        x.COUNTRY country unless country.nil? # minOccurs="0" in schema (above)
        x.PHONE phone
      }
      return buf
    end

    ##
    # Produces the Payee as a row of comma separated values
    # (delegates to +to_csv+)
    #
    def to_s
      to_csv
    end

  end # class Payee
end # module
114
+
@@ -0,0 +1,495 @@
1
+
2
+ require 'rubygems'
3
+ require 'mechanize'
4
+ require 'logger'
5
+ require 'bankjob'
6
+
7
+ module Bankjob
8
+
9
+ ##
10
+ # The Scraper class is the basis of all Bankjob web scrapers for scraping specific
11
+ # bank websites.
12
+ #
13
+ # To create your own scraper simply subclass Scraper and be sure to override
14
+ # the method +scrape_statement+ to perform the scraping and return a
15
+ # Bankjob::Statement object.
16
+ #
17
+ # Scraper provides some other optional methods to help you build Statements:
18
+ #
19
+ # +currency+:: use this class attribute to set the OFX currency at the top of
20
+ # your Scraper subclass definition. E.g.:
21
+ #
22
+ #
23
+ # class MyScraper < Scraper
24
+ # currency "USD"
25
+ # ...
26
+ #
27
+ # It defaults to "EUR" for euros.
28
+ #
29
+ # +decimal+:: use this class attribute to set the decimal separator at the top of
30
+ # your Scraper subclass definition. E.g.:
31
+ #
32
+ # class MyScraper < Scraper
33
+ # decimal ","
34
+ # ...
35
+ #
36
+ # It defaults to "." (period), the common alternative being "," (comma)
37
+ #
38
+ # Note that this should be set to the separator used in the +amount+
39
+ # attribute of the Transaction objects your Scraper creates. If, say,
40
+ # you deliberately scrape values like "12,34" and convert them to
41
+ # "12.34" before storing them in your Transaction, then leave the
42
+ # decimal as ".".
43
 + #                 If you choose to store the Transaction amount as "12,34",
44
+ # however, the +decimal+ setting becomes important when calling
45
+ # Transaction#real_amount to get the amount as a Float upon which
46
+ # calculations can be performed.
47
+ #
48
+ # +options+:: holds the command line options provided when Bankjob was launched.
49
+ # Use this attribute to get access to global options. For your scraper
50
+ # specific options use the array passed into +scrape_statement+ instead.
51
+ # (See #options below for more advice on how to use this)
52
+ #
53
+ # +logger+:: holds the logger initialized by Bankjob based on the command line
54
+ # options. Use this to attribute to log information, warnings and debug messages
55
+ # from your logger.
56
+ # (See #logger below for more advice on how to use this)
57
+ #
58
+ # +create_statement+:: creates a new empty Statement object with the appropriate
59
+ # default attributes (that is, the right currency)
60
+ # Use this in your Scraper to instantiate new Statement objects.
61
+ #
62
+ # +create_transaction+:: creates a new empty Transaction object with the appropriate
63
+ # default attributes (that is, the right decimal separator)
64
+ # Use this in your Scraper to instantiate new Transaction objects.
65
+ #
66
+ # +transaction_rule+:: registers a rule to be applied to all transactions after the
67
+ # statement has been scraped.
68
 + #                       Define as many of these as you need in your scraper to build better
69
+ # organized Transaction objects with clearer descriptions of the
70
+ # transaction, etc.
71
+ #
72
+ # Here is an example of a simple (but incomplete) scraper.
73
+ # Note that all of the scraping and parsing is in the +scrape_statement+ method, although
74
+ # a lot of the details of Hpricot parsing are left up to the imagination of the reader.
75
+ #
76
+ # When creating a scraper yourself look in the +scrapers+ directory of the bankjob gem
77
+ # to see some more useful examples.
78
+ #
79
+ # class AcmeBankScraper < Scraper
80
+ # #####
81
+ # # 1. Set up the Scraper properties for currency and separator
82
+ # # (this is optional)
83
+ #
84
+ # currency "EUR" # set the currency (EUR is the default anyway but just to demo..)
85
+ # decimal "," # set the decimal separator to comma instead of .
86
+ #
87
+ # #####
88
+ # # 2. Create some rules to post-process my transactions
89
+ # # (this is optional but is easier to maintain than manipulating
90
+ # # the values in the scraper itself)
91
+ #
92
+ # # rule to set negative transactions as debits
93
+ # transaction_rule do |tx|
94
+ # tx.type = "DEBIT" if (tx.real_amount < 0 and tx.type == "OTHER")
95
+ # end
96
+ #
97
+ # # General description parsing rule
98
+ # transaction_rule do |tx|
99
+ # case tx.description
100
+ # when /ATM/i
101
+ # tx.type = "ATM"
102
+ # when /ELEC PURCHASE/
103
+ # tx.description.gsub!(/ELEC PURCHASE \d+/, "spent with ATM card: ")
104
+ # end
105
+ # end
106
+ #
107
+ # #####
108
+ # # 3. Implement main engine of the scraper
109
+ # # (this is essential and where 99% of the work is)
110
+ #
111
+ # def scrape_statement(args)
112
+ #
113
+ # logger.debug("Reading debug input html from #{options.input} instead of scraping the real website.")
114
+ # agent = WWW::Mechanize.new
115
+ # agent.user_agent_alias = 'Windows IE 6' # pretend that we're IE 6.0
116
+ # # navigate to the login page
117
+ # login_page = agent.get("http://mybank.com/login")
118
+ # # find login form, fill it out and submit it
119
+ # form = login_page.forms.name('myBanksLoginForm').first
120
+ # # Mechanize creates constants like USERNAME for the form element it finds with that name
121
+ # form.USERNAME = args[0] # assuming -scraper_args "user password"
122
+ # form.PASSWORD = args[1]
123
+ # agent.submit(form)
124
+ # sleep 3 #wait while the login takes effect
125
+ #
126
+ # transactions_page = agent.get("http://mybank.com/transactions")
127
+ # statement = create_statement
128
+ #
129
+ # # ... go read the Hpricot documentation to work out how to get your transactions out of
130
+ # # the transactions_page and create a new transaction object for each one
131
+ # # We're going to gloss over that part here ....
132
+ #
133
+ # table = # use Hpricot to get the html table element assuming your transactions are in a table
134
+ # rows = (table/"tr[@valign=top]") # works for a table where the rows needed have the valign attr set to top
135
+ # rows.each do |row|
136
+ # transaction = create_transaction
137
+ # transaction.date = #... scrape a date here
138
+ # ...
139
 + #       statement.transactions << transaction
140
+ # end
141
+ # end
142
+ # end
143
+ #
144
+ #--
145
+ # (Non RDOC comment) There are two parts to the Scraper class:
146
+ # - the public part which defines the
147
+ # method to be overridden in subclasses and provides utility methods and attributes;
148
+ # - the private internal part which handles the mechanics of registering a
149
+ # subclass as the scraper to be used, setting the currency and decimal attributes
150
+ # and registering transaction rules
151
+ #
152
+ #
153
class Scraper

  ##
  # Provides access to a logger instance created in the BankjobRunner which
  # subclasses can use for logging if they need to.
  #
  # To use this in your own scraper, use code like:
  #
  #   logger.debug("MyScraper is scraping the page at #{my_url}")
  #   logger.info("MyScraper fetched a new statement from MyBank")
  #   logger.warn("MyScraper could not parse the closing balance")
  #
  attr_accessor :logger

  ##
  # Provides access to the command line options which subclasses can use if
  # they need access to the global options used to launch Bankjob.
  #
  # To use this in your own scraper, use code like:
  #
  #   if (options.input?) then
  #     print "the input html file for debugging is #{options.input}"
  #   end
  #
  attr_accessor :options

  ##
  # Returns the decimal separator for this scraper.
  # This is typically set in the scraper class using the "decimal" directive.
  # Falls back to "." - the documented default - when no directive was used
  # (previously this raised a NameError for an uninitialized class variable).
  #
  def decimal
    defined?(@@decimal) ? @@decimal : "."
  end

  ##
  # Returns the OFX currency for this scraper.
  # This is typically set in the scraper class using the "currency" directive.
  # Falls back to "EUR" - the documented default - when no directive was used.
  #
  def currency
    defined?(@@currency) ? @@currency : "EUR"
  end

  ##
  # Sets the decimal separator for the money amounts used in the data fetched
  # by this scraper.
  # The scraper class can use this as a directive to set the separator so:
  #   decimal ","
  #
  # Defaults to period ".", but will typically need to be set as a comma in
  # european websites.
  #
  def self.decimal(decimal)
    @@decimal = decimal
  end

  ##
  # Sets the OFX currency name for use in the OFX statements produced by
  # this scraper.
  #
  # The scraper class can use this as a directive to set the currency so:
  #   currency "USD"
  #
  # Defaults to "EUR".
  #
  def self.currency(currency)
    @@currency = currency
  end

  ##
  # Sets the account number for statements produced by this scraper.
  #
  # The scraper class can use this as a directive to set the number so:
  #   account_number "12345678"
  #
  # Must be a string from 1 to 22 chars in length.
  #
  # This will be used by the create_statement method to set the account,
  # but the scraper may ignore this and simply construct its own statements
  # or change the number using the accessor: statement.account_number =
  # after constructing it.
  #
  def self.account_number(account_number)
    @@account_number = account_number
  end

  ##
  # Sets the account type for statements produced by this scraper.
  #
  # The scraper class can use this as a directive to set the type so:
  #   account_type Statement::SAVINGS
  #
  # Must be a string based on one of the constants in Statement.
  #
  # This will be used by the create_statement method to set the account type,
  # but the scraper may ignore this and simply construct its own statements
  # or change the type using the accessor: statement.account_type =
  # after constructing it.
  #
  # Defaults to Statement::CHECKING.
  #
  def self.account_type(account_type)
    @@account_type = account_type
  end

  ##
  # Sets the bank identifier for statements produced by this scraper.
  #
  # The scraper class can use this as a directive to set the number so:
  #   bank_id "12345678"
  #
  # Must be a string from 1 to 9 chars in length.
  #
  # This will be used by the create_statement method to set the bank id,
  # but the scraper may ignore this and simply construct its own statements
  # or change the number using the accessor: statement.bank_id =
  # after constructing it.
  #
  # Defaults to blank.
  #
  def self.bank_id(bank_id)
    @@bank_id = bank_id
  end

  ##
  # ScraperRule is a struct used for holding a rule body with its priority.
  # Users can create transaction rules in their Scraper subclasses using
  # the Scraper.transaction_rule method.
  ScraperRule = Struct.new(:priority, :rule_body)

  ##
  # Registers a rule (a block taking one Transaction argument) to be applied
  # to every transaction after the statement has been scraped, allowing each
  # transaction to be manipulated into a more useful form for the client.
  #
  # For example, the transaction description might be simplified to remove
  # certain common strings, or the Payee details might be extracted from the
  # description.
  #
  # Implementing this as a class method taking a block permits the user to
  # register several independent processing rules by calling this method
  # several times, rather than implementing a single method (giving it a
  # sort of DSL look). E.g.:
  #
  #   # This rule detects ATM withdrawals and modifies
  #   # the description and the type
  #   transaction_rule do |tx|
  #     if (tx.real_amount < 0)
  #       if tx.raw_description =~ /WDR.*ATM\s+\d+\s+/i
  #         # $' holds whatever is after the pattern match - usually the ATM location
  #         tx.description = "ATM withdrawal at #{$'}"
  #         tx.type = Transaction::ATM
  #       end
  #     end
  #   end
  #
  # A transaction rule can optionally specify a +priority+ - any integer
  # value. The default priority is zero, with lower priority rules being
  # executed last.
  #
  # The final order in which transaction rules will be executed is thus:
  # * rules with a higher priority value will be executed before rules with
  #   a lower priority no matter where they are declared
  # * rules of the same priority declared in the same class will be executed
  #   in the order in which they are declared - top rules first
  # * rules in parent classes are executed before rules of the same priority
  #   in subclasses.
  #
  # If you really want a rule to be fired last, and you want to allow for
  # subclasses of your scraper, use a negative priority like this:
  #
  #   transaction_rule(-999) do |tx|
  #     puts "I get executed last"
  #   end
  #
  def self.transaction_rule(priority = 0, &rule_body)
    @@transaction_rules ||= []
    rule = ScraperRule.new(priority, rule_body)
    # Array#sort won't work here because it does not preserve the order of
    # rules with equal priority - which would break the ordering rules
    # detailed above. So each new rule is inserted in order instead.
    #
    # Imagine we already have rules in order of priority such as:
    #   A:999, B:999, C:0, D:0, E:-999, F:-999
    # and we are adding X:0, which should come after D since it was added
    # later. First reverse the array:
    #   F:-999, E:-999, D:0, C:0, B:999, A:999
    # then find the first element with priority greater than OR EQUAL to
    # X's priority of 0. "Greater than" alone won't work because we would
    # end up putting X between B and C whereas it was added after D.
    # So we find D, get its index in the original array (3), and insert X
    # just after it (at 4) in the forward-sorted rules.
    rev = @@transaction_rules.reverse
    last_higher_or_equal = rev.find { |r| r.priority.to_i >= priority }
    if last_higher_or_equal.nil?
      # every existing rule has a lower priority - insert at the start
      @@transaction_rules.insert(0, rule)
    else
      index_of_last = @@transaction_rules.index(last_higher_or_equal)
      # now insert it after the last higher or equal priority rule
      @@transaction_rules.insert(index_of_last + 1, rule)
    end
  end

  ##
  # Runs through all of the rules registered with calls to +transaction_rule+
  # and applies them to each Transaction in the specified +statement+.
  #
  # Bankjob calls this after +scrape_statement+ and before writing out the
  # statement to CSV or OFX.
  #
  def self.post_process_transactions(statement) #:nodoc:
    if defined?(@@transaction_rules)
      @@transaction_rules.each do |rule|
        statement.transactions.each do |transaction|
          rule.rule_body.call(transaction)
        end
      end
    end
    return statement
  end

  ##
  # Scrapes a website to produce a new Statement object.
  #
  # This is the one method which a Scraper subclass *must* implement by
  # overriding this method.
  #
  # Override this in your own Scraper to use Mechanize and Hpricot (or
  # some other mechanism if you prefer) to parse your bank website
  # and create a Bankjob::Statement object to hold the data.
  #
  # The implementation here will raise an error if not overridden.
  # NOTE(review): subclasses are documented as accepting a scraper-args
  # array (scrape_statement(args)); Ruby permits the arity mismatch here
  # since only the override is ever invoked by the runner - confirm.
  #
  def scrape_statement
    raise "You must override the instance method scrape_statement in your scraper!"
  end

  ##
  # Creates a new Statement.
  #
  # Calling this method is the preferred way of creating a new Statement
  # object since it sets the OFX currency, bank id and account type based
  # on the values set via the class directives in the Scraper subclass.
  # It is otherwise no different, however, than calling Statement.new()
  # yourself.
  #
  def create_statement
    # +currency+ falls back to "EUR" when no directive was used;
    # @@account_number has no default and must be set by the scraper
    # (via the directive, or on the statement after construction).
    statement = Statement.new(@@account_number, currency)
    statement.bank_id = @@bank_id if defined?(@@bank_id)
    statement.account_type = @@account_type if defined?(@@account_type)
    return statement
  end

  ##
  # Creates a new Transaction.
  #
  # Calling this method is the preferred way of creating a new Transaction
  # object since it sets the decimal separator based on the value set via
  # the class directive in the Scraper subclass.
  #
  # It is otherwise no different, however, than calling Transaction.new()
  # yourself.
  #
  def create_transaction
    # +decimal+ falls back to "." when no directive was used
    Transaction.new(decimal)
  end

  ##
  # Private
  #
  # The internal workings of the Scraper come after this point - they
  # are not documented in RDOC
  ##

  # SCRAPER_INTERFACE is the list of methods that a scraper must define
  SCRAPER_INTERFACE = [:scrape_statement]

  # set up the directories in which user's scrapers will be sought
  HOME_DIR = File.dirname(__FILE__)
  SCRAPERS_DIR = File.join(HOME_DIR, "..", "..", "scrapers")

  ##
  # +inherited+ is always called when a class extends Scraper.
  # The subclass itself is passed in as +scraper_class+ allowing
  # us to register it to be instantiated later.
  #
  def self.inherited(scraper_class) #:nodoc:
    # Verify that the scraper class defines the necessary methods.
    # NOTE(review): +inherited+ fires when the `class X < Scraper` line is
    # executed - before the subclass body has been evaluated - and Scraper
    # itself defines scrape_statement, so public_method_defined? is always
    # true via inheritance and this check can never fail. Kept as-is to
    # preserve behavior; it documents the intended interface.
    SCRAPER_INTERFACE.each do |method|
      if (not scraper_class.public_method_defined?(method))
        raise "Invalid scraper: the scraper class #{scraper_class.name} does not define the method #{method}"
      end
    end
    # in the future we might keep a registry of scrapers but for now
    # we assume there will always be one, and just register that class
    @@last_scraper_class = scraper_class
  end

  ##
  # This is the main method of the dynamic Scraper-loader: It loads
  # the actual scraper ruby file and initializes the class therein.
  #
  # Note that no assumption is made about the name of the class
  # defined within the specified +scraper_filename+. Rather, the
  # +self.inherited+ method will hold a reference to the last
  # class loaded that extends Bankjob::Scraper and that reference
  # is used here to initialize the class immediately after load()
  # is called on the specified file.
  #
  def self.load_scraper(scraper_filename, options, logger) #:nodoc:
    # temporarily add the same dir as bankjob and the scrapers dir
    # to the ruby LOAD_PATH for finding the scraper
    begin
      $:.unshift(HOME_DIR)
      $:.unshift(SCRAPERS_DIR)
      logger.debug("About to load the scraper file named #{scraper_filename}")
      load(scraper_filename)
    rescue ScriptError, StandardError => e
      # ScriptError catches LoadError/SyntaxError raised by load();
      # StandardError catches runtime failures in the scraper file itself.
      # (Deliberately NOT `rescue Exception`, which would also swallow
      # SignalException and SystemExit.)
      logger.error("Failed to load the scraper file #{scraper_filename} due to #{e.message}.\n\t#{e.backtrace[0]}")
    ensure
      $:.delete(SCRAPERS_DIR)
      $:.delete(HOME_DIR)
    end

    if (not defined?(@@last_scraper_class) or @@last_scraper_class.nil?)
      raise "Cannot initialize the scraper as none was loaded successfully."
    else
      logger.debug("About to instantiate scraper class: #{@@last_scraper_class.name}\n")
      scraper = @@last_scraper_class.new()
      scraper.logger = logger
      scraper.options = options
    end

    return scraper
  end # load_scraper
end # Scraper
495
+ end # module Bankjob