bankjob 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +4 -0
- data/PostInstall.txt +4 -0
- data/README.rdoc +77 -0
- data/bin/bankjob +10 -0
- data/lib/bankjob.rb +12 -0
- data/lib/bankjob/bankjob_runner.rb +184 -0
- data/lib/bankjob/cli.rb +258 -0
- data/lib/bankjob/payee.rb +114 -0
- data/lib/bankjob/scraper.rb +495 -0
- data/lib/bankjob/statement.rb +355 -0
- data/lib/bankjob/support.rb +217 -0
- data/lib/bankjob/transaction.rb +400 -0
- data/scrapers/base_scraper.rb +133 -0
- data/scrapers/bpi_scraper.rb +190 -0
- data/spec/bankjob_cli_spec.rb +15 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +10 -0
- data/spec/statement_spec.rb +121 -0
- data/spec/transaction_spec.rb +81 -0
- metadata +114 -0
@@ -0,0 +1,400 @@
|
|
1
|
+
|
2
|
+
require 'rubygems'
|
3
|
+
require 'builder'
|
4
|
+
require 'digest/md5'
|
5
|
+
require 'bankjob.rb'
|
6
|
+
|
7
|
+
module Bankjob
|
8
|
+
|
9
|
+
##
|
10
|
+
# A Transaction object represents a transaction in a bank account (a withdrawal, deposit,
|
11
|
+
# transfer, etc) and is generally the result of running a Bankjob scraper.
|
12
|
+
#
|
13
|
+
# A Scraper will create Transactions while scraping web pages in an online banking site.
|
14
|
+
# These Transactions will be collected in a Statement object which will then be written
|
15
|
+
# to a file.
|
16
|
+
#
|
17
|
+
# A Transaction object knows how to write itself as a record in a CSV
|
18
|
+
# (Comma Separated Values) file using +to_csv+ or as an XML element in an
|
19
|
+
# OFX (Open Financial eXchange http://www.ofx.net) file using +to_ofx+
|
20
|
+
#
|
21
|
+
class Transaction
|
22
|
+
|
23
|
+
# OFX transaction type for Generic credit
|
24
|
+
CREDIT = "CREDIT"
|
25
|
+
|
26
|
+
# OFX transaction type for Generic debit
|
27
|
+
DEBIT = "DEBIT"
|
28
|
+
|
29
|
+
# OFX transaction type for Interest earned or paid. (Depends on signage of amount)
|
30
|
+
INT = "INT"
|
31
|
+
|
32
|
+
# OFX transaction type for Dividend
|
33
|
+
DIV = "DIV"
|
34
|
+
|
35
|
+
# OFX transaction type for FI fee
|
36
|
+
FEE = "FEE"
|
37
|
+
|
38
|
+
# OFX transaction type for Service charge
|
39
|
+
SRVCHG = "SRVCHG"
|
40
|
+
|
41
|
+
# OFX transaction type for Deposit
|
42
|
+
DEP = "DEP"
|
43
|
+
|
44
|
+
# OFX transaction type for ATM debit or credit. (Depends on signage of amount)
|
45
|
+
ATM = "ATM"
|
46
|
+
|
47
|
+
# OFX transaction type for Point of sale debit or credit. (Depends on signage of amount)
|
48
|
+
POS = "POS"
|
49
|
+
|
50
|
+
# OFX transaction type for Transfer
|
51
|
+
XFER = "XFER"
|
52
|
+
|
53
|
+
# OFX transaction type for Check
|
54
|
+
CHECK = "CHECK"
|
55
|
+
|
56
|
+
# OFX transaction type for Electronic payment
|
57
|
+
PAYMENT = "PAYMENT"
|
58
|
+
|
59
|
+
# OFX transaction type for Cash withdrawal
|
60
|
+
CASH = "CASH"
|
61
|
+
|
62
|
+
# OFX transaction type for Direct deposit
|
63
|
+
DIRECTDEP = "DIRECTDEP"
|
64
|
+
|
65
|
+
# OFX transaction type for Merchant initiated debit
|
66
|
+
DIRECTDEBIT = "DIRECTDEBIT"
|
67
|
+
|
68
|
+
# OFX transaction type for Repeating payment/standing order
|
69
|
+
REPEATPMT = "REPEATPMT"
|
70
|
+
|
71
|
+
# OFX transaction type for Other
|
72
|
+
OTHER = "OTHER"
|
73
|
+
|
74
|
+
# OFX type of the transaction (credit, debit, atm withdrawal, etc)
|
75
|
+
# Translates to the OFX element TRNTYPE and according to the OFX 2.0.3 schema this can be one of
|
76
|
+
# * CREDIT
|
77
|
+
# * DEBIT
|
78
|
+
# * INT
|
79
|
+
# * DIV
|
80
|
+
# * FEE
|
81
|
+
# * SRVCHG
|
82
|
+
# * DEP
|
83
|
+
# * ATM
|
84
|
+
# * POS
|
85
|
+
# * XFER
|
86
|
+
# * CHECK
|
87
|
+
# * PAYMENT
|
88
|
+
# * CASH
|
89
|
+
# * DIRECTDEP
|
90
|
+
# * DIRECTDEBIT
|
91
|
+
# * REPEATPMT
|
92
|
+
# * OTHER
|
93
|
+
attr_accessor :type
|
94
|
+
|
95
|
+
# date of the transaction
|
96
|
+
# Translates to OFX element DTPOSTED
|
97
|
+
attr_accessor :date
|
98
|
+
|
99
|
+
# the date the value affects the account (e.g. funds become available)
|
100
|
+
# Translates to OFX element DTUSER
|
101
|
+
attr_accessor :value_date
|
102
|
+
|
103
|
+
# description of the transaction
|
104
|
+
# This description is typically set by taking the raw description and
|
105
|
+
# applying rules. If it is not set explicitly it returns the same
|
106
|
+
# value as +raw_description+
|
107
|
+
# Translates to OFX element MEMO
|
108
|
+
attr_accessor :description
|
109
|
+
|
110
|
+
# the original format of the description as scraped from the bank site
|
111
|
+
# This allows the raw information to be preserved when modifying the
|
112
|
+
# +description+ with transaction rules (see Scraper#transaction_rule)
|
113
|
+
# This does _not_ appear in the OFX output, only +description+ does.
|
114
|
+
attr_accessor :raw_description
|
115
|
+
|
116
|
+
# amount of the credit or debit (negative for debits)
|
117
|
+
# Translates to OFX element TRNAMT
|
118
|
+
attr_accessor :amount
|
119
|
+
|
120
|
+
# account balance after the transaction
|
121
|
+
# Not used in OFX but important for working out statement balances
|
122
|
+
attr_accessor :new_balance
|
123
|
+
|
124
|
+
# account balance after the transaction as a numeric Ruby Float
|
125
|
+
# Not used in OFX but important for working out statement balances
|
126
|
+
# in calculations (see #real_amount)
|
127
|
+
attr_reader :real_new_balance
|
128
|
+
|
129
|
+
# the generated unique id for this transaction in an OFX record
|
130
|
+
# Translates to OFX element FITID this is generated if not set
|
131
|
+
attr_accessor :ofx_id
|
132
|
+
|
133
|
+
# the payee of an expenditure (ie a debit or transfer)
|
134
|
+
# This is of type Payee and translates to complex OFX element PAYEE
|
135
|
+
attr_accessor :payee
|
136
|
+
|
137
|
+
# the cheque number of a cheque transaction
|
138
|
+
# Translates to OFX element CHECKNUM
|
139
|
+
attr_accessor :check_number
|
140
|
+
|
141
|
+
##
|
142
|
+
# the numeric real-number amount of the transaction.
|
143
|
+
#
|
144
|
+
# The transaction amount is typically a string and may hold commas for
|
145
|
+
# 1000s or for decimal separators, making it unusable for mathematical
|
146
|
+
# operations.
|
147
|
+
#
|
148
|
+
# This attribute returns the amount converted to a Ruby Float, which can
|
149
|
+
# be used in operations like:
|
150
|
+
# <tt>
|
151
|
+
# if (transaction.real_amount < 0)
|
152
|
+
# puts "It's a debit!"
|
153
|
+
# end
|
154
|
+
#
|
155
|
+
# The +real_amount+ attribute is calculated using the +decimal+ separator
|
156
|
+
# passed into the constructor (defaults to ".")
|
157
|
+
# See Scraper#decimal
|
158
|
+
#
|
159
|
+
# This attribute is not used in OFX.
|
160
|
+
#
|
161
|
+
attr_reader :real_amount
|
162
|
+
|
163
|
+
##
|
164
|
+
# Creates a new Transaction with the specified attributes.
|
165
|
+
#
|
166
|
+
def initialize(decimal = ".")
  # +decimal+ is the decimal separator used in the scraped amount strings;
  # real_amount and real_new_balance hand it to Bankjob.string_to_float.
  @decimal = decimal

  # Nothing is known about the transaction yet.
  @ofx_id = @date = @value_date = nil
  @raw_description = @description = nil
  @check_number = nil

  # Zero amounts and the generic OTHER type until something more specific is set.
  @amount = 0
  @new_balance = 0
  @type = OTHER

  # Always create a Payee, even if it never gets used: to_ofx then always emits
  # a PAYEE element, which (among other things) stops Wesabe from adding
  # UNKNOWN PAYEE to every transaction.
  @payee = Payee.new
end
|
183
|
+
|
184
|
+
def date=(raw_date_time)
  # Store the result of Bankjob.create_date_time rather than the raw value.
  parsed = Bankjob.create_date_time(raw_date_time)
  @date = parsed
end
|
187
|
+
|
188
|
+
def value_date=(raw_date_time)
  # Store the result of Bankjob.create_date_time rather than the raw value.
  parsed = Bankjob.create_date_time(raw_date_time)
  @value_date = parsed
end
|
191
|
+
|
192
|
+
##
|
193
|
+
# Creates a unique ID for the transaction for use in OFX documents, unless
|
194
|
+
# one has already been set.
|
195
|
+
# All OFX transactions need a unique identifier.
|
196
|
+
#
|
197
|
+
# Note that this is generated by creating an MD5 digest of the transaction
|
198
|
+
# date, raw description, type, amount and new_balance. Which means that two
|
199
|
+
# identical transactions will always produce the same +ofx_id+.
|
200
|
+
# (This is important so that repeated scrapes of the same transaction value
|
201
|
+
# produce identical ofx_id values)
|
202
|
+
#
|
203
|
+
def ofx_id()
  # An id that was set explicitly (or generated on an earlier call) wins.
  return @ofx_id unless @ofx_id.nil?

  # Otherwise derive the id from the transaction's own values, so scraping the
  # same transaction again yields the same id.
  fingerprint = [@date, @raw_description, @type, @amount, @new_balance].map(&:to_s).join(":")
  @ofx_id = Digest::MD5.hexdigest(fingerprint)
end
|
210
|
+
|
211
|
+
##
|
212
|
+
# Returns the description, defaulting to the +raw_description+ if no
|
213
|
+
# specific description has been set by the user.
|
214
|
+
#
|
215
|
+
def description()
  # Fall back to the scraped text when no description has been set explicitly.
  return raw_description if @description.nil?

  @description
end
|
218
|
+
|
219
|
+
##
|
220
|
+
# Returns the Transaction amount attribute as a ruby Float after
|
221
|
+
# replacing the decimal separator with a . and stripping any other
|
222
|
+
# separators.
|
223
|
+
#
|
224
|
+
def real_amount()
  # Parsing is delegated to Bankjob.string_to_float, which receives the raw
  # amount and the decimal separator given to the constructor.
  raw, separator = amount, @decimal
  Bankjob.string_to_float(raw, separator)
end
|
227
|
+
|
228
|
+
##
|
229
|
+
# Returns the new balance after the transaction as a ruby Float after
|
230
|
+
# replacing the decimal separator with a . and stripping any other
|
231
|
+
# separators.
|
232
|
+
#
|
233
|
+
def real_new_balance()
  # Parsing is delegated to Bankjob.string_to_float, which receives the raw
  # balance and the decimal separator given to the constructor.
  raw, separator = new_balance, @decimal
  Bankjob.string_to_float(raw, separator)
end
|
236
|
+
|
237
|
+
##
|
238
|
+
# Generates a string representing this Transaction as comma separated values
|
239
|
+
# in the form:
|
240
|
+
#
|
241
|
+
# <tt>date, value_date, description, real_amount, real_new_balance, amount, new_balance, raw_description, ofx_id</tt>
|
242
|
+
#
|
243
|
+
def to_csv
  # Serializes this transaction as one CSV record with the columns:
  #   date, value_date, description, real_amount, real_new_balance,
  #   amount, new_balance, raw_description, ofx_id
  #
  # When the payee has a name it is prepended to the description as
  # "<name> - <description>"; otherwise the description is written unchanged.
  payee_name = payee.nil? ? nil : payee.name
  desc =
    if payee_name.nil?
      description
    else
      # Join only the parts that are present: a named payee combined with a
      # missing description used to raise a TypeError (String + nil).
      [payee_name, description].compact.join(" - ")
    end

  [
    Bankjob.date_time_to_csv(date),
    Bankjob.date_time_to_csv(value_date),
    desc,
    real_amount,
    real_new_balance,
    amount,
    new_balance,
    raw_description,
    ofx_id
  ].to_csv
end
|
252
|
+
|
253
|
+
##
|
254
|
+
# Generates a string for use as a header in a CSV file for transactions.
|
255
|
+
# This will produce the following string:
|
256
|
+
#
|
257
|
+
# <tt>date, value_date, description, real_amount, real_new_balance, amount, new_balance, raw_description, ofx_id</tt>
|
258
|
+
#
|
259
|
+
def self.csv_header
  # One title per column, in the same order in which to_csv writes the values.
  titles = [
    "Date", "Value-Date", "Description", "Amount", "New-Balance",
    "Raw-Amount", "Raw-New-Balance", "Raw-Description", "OFX-ID"
  ]
  titles.to_csv
end
|
262
|
+
|
263
|
+
##
|
264
|
+
# Creates a new Transaction from a string that defines a row in a CSV file.
|
265
|
+
#
|
266
|
+
# +csv_row+ must hold an array of values in precisely this order:
|
267
|
+
#
|
268
|
+
# <tt>date, value_date, description, real_amount, real_new_balance, amount, new_balance, raw_description, ofx_id</tt>
|
269
|
+
#
|
270
|
+
# <em>(The format should be the same as that produced by +to_csv+)</em>
|
271
|
+
#
|
272
|
+
def self.from_csv(csv_row, decimal)
  # Builds a Transaction from the nine values of one CSV record (the layout
  # written by to_csv). +decimal+ is passed to the Transaction constructor.
  # Raises a RuntimeError describing the expected layout when the record does
  # not have exactly nine columns.
  unless csv_row.length == 9 # must have 9 cols
    csv_lines = csv_row.join("\n\t")
    raise "Failed to create Transaction from csv row: \n\t#{csv_lines}\n" \
          " - 9 columns are required in the form: date, value_date, " \
          "description, real_amount, real_new_balance, amount, new_balance, " \
          "raw_description, ofx_id"
  end

  tx = Transaction.new(decimal)
  tx.date = csv_row[0]
  tx.value_date = csv_row[1]
  tx.description = csv_row[2]
  # Columns 3 and 4 (real_amount, real_new_balance) are read-only and are
  # calculated from the raw values, so they are skipped.
  tx.amount = csv_row[5]
  tx.new_balance = csv_row[6]
  tx.raw_description = csv_row[7]
  tx.ofx_id = csv_row[8]
  tx
end
|
287
|
+
|
288
|
+
##
|
289
|
+
# Generates an XML string adhering to the OFX standard
|
290
|
+
# (see Open Financial Exchange http://www.ofx.net)
|
291
|
+
# representing a single Transaction XML element.
|
292
|
+
#
|
293
|
+
# The OFX 2 schema defines a STMTTRN (StatementTransaction) as follows:
|
294
|
+
#
|
295
|
+
# <xsd:complexType name="StatementTransaction">
|
296
|
+
# <xsd:annotation>
|
297
|
+
# <xsd:documentation>
|
298
|
+
# The OFX element "STMTTRN" is of type "StatementTransaction"
|
299
|
+
# </xsd:documentation>
|
300
|
+
# </xsd:annotation>
|
301
|
+
# <xsd:sequence>
|
302
|
+
# <xsd:element name="TRNTYPE" type="ofx:TransactionEnum"/>
|
303
|
+
# <xsd:element name="DTPOSTED" type="ofx:DateTimeType"/>
|
304
|
+
# <xsd:element name="DTUSER" type="ofx:DateTimeType" minOccurs="0"/>
|
305
|
+
# <xsd:element name="DTAVAIL" type="ofx:DateTimeType" minOccurs="0"/>
|
306
|
+
# <xsd:element name="TRNAMT" type="ofx:AmountType"/>
|
307
|
+
# <xsd:element name="FITID" type="ofx:FinancialInstitutionTransactionIdType"/>
|
308
|
+
# <xsd:sequence minOccurs="0">
|
309
|
+
# <xsd:element name="CORRECTFITID" type="ofx:FinancialInstitutionTransactionIdType"/>
|
310
|
+
# <xsd:element name="CORRECTACTION" type="ofx:CorrectiveActionEnum"/>
|
311
|
+
# </xsd:sequence>
|
312
|
+
# <xsd:element name="SRVRTID" type="ofx:ServerIdType" minOccurs="0"/>
|
313
|
+
# <xsd:element name="CHECKNUM" type="ofx:CheckNumberType" minOccurs="0"/>
|
314
|
+
# <xsd:element name="REFNUM" type="ofx:ReferenceNumberType" minOccurs="0"/>
|
315
|
+
# <xsd:element name="SIC" type="ofx:StandardIndustryCodeType" minOccurs="0"/>
|
316
|
+
# <xsd:element name="PAYEEID" type="ofx:PayeeIdType" minOccurs="0"/>
|
317
|
+
# <xsd:choice minOccurs="0">
|
318
|
+
# <xsd:element name="NAME" type="ofx:GenericNameType"/>
|
319
|
+
# <xsd:element name="PAYEE" type="ofx:Payee"/>
|
320
|
+
# </xsd:choice>
|
321
|
+
# <xsd:choice minOccurs="0">
|
322
|
+
# <xsd:element name="BANKACCTTO" type="ofx:BankAccount"/>
|
323
|
+
# <xsd:element name="CCACCTTO" type="ofx:CreditCardAccount"/>
|
324
|
+
# </xsd:choice>
|
325
|
+
# <xsd:element name="MEMO" type="ofx:MessageType" minOccurs="0"/>
|
326
|
+
# <xsd:choice minOccurs="0">
|
327
|
+
# <xsd:element name="CURRENCY" type="ofx:Currency"/>
|
328
|
+
# <xsd:element name="ORIGCURRENCY" type="ofx:Currency"/>
|
329
|
+
# </xsd:choice>
|
330
|
+
# <xsd:element name="INV401KSOURCE" type="ofx:Investment401kSourceEnum" minOccurs="0"/>
|
331
|
+
# </xsd:sequence>
|
332
|
+
# </xsd:complexType>
|
333
|
+
#
|
334
|
+
def to_ofx
  # Renders this transaction as a STMTTRN element and returns the XML string.
  buf = ""
  # margin 5 indents the element nicely within the output of Statement.to_ofx
  xml = Builder::XmlMarkup.new(:target => buf, :indent => 2, :margin => 5)
  xml.STMTTRN do # transaction statement
    xml.TRNTYPE type
    # Date transaction was posted to account, [datetime] yyyymmdd or yyyymmddhhmmss
    xml.DTPOSTED Bankjob.date_time_to_ofx(date)
    # Amount of transaction [amount] can be , or . separated
    xml.TRNAMT amount
    xml.FITID ofx_id
    xml.CHECKNUM check_number if !check_number.nil?
    # The payee renders itself; its output is appended straight to the buffer
    # the builder is writing to.
    buf << payee.to_ofx if !payee.nil?
    # (a NAME element is deliberately not written: x.NAME description)
    xml.MEMO description
  end
  buf
end
|
350
|
+
|
351
|
+
##
|
352
|
+
# Produces a string representation of the transaction
|
353
|
+
#
|
354
|
+
def to_s
  # One-line summary built from the raw instance variables, so a missing
  # ofx_id is shown as empty rather than being generated here.
  format(
    "%s - ofx_id: %s, date:%s, raw description: %s, type: %s amount: %s, new balance: %s",
    self.class, @ofx_id, @date, @raw_description, @type, @amount, @new_balance
  )
end
|
357
|
+
|
358
|
+
##
|
359
|
+
# Overrides == to allow comparison of Transaction objects so that they can
|
360
|
+
# be merged in Statements. See Statement#merge
|
361
|
+
#
|
362
|
+
def ==(other) #:nodoc:
  # Two transactions are equal when their date, raw description, amount, type
  # and new balance match. Anything that is not a Transaction is never equal;
  # this returns false (not nil) so callers always get a boolean.
  return false unless other.kind_of?(Transaction)

  # Sometimes the same date, when written and read back, will not appear equal,
  # so both dates are converted to a canonical string first.
  # The value date is ignored - it may be updated between statements.
  # (consider using ofx_id here later)
  Bankjob.date_time_to_ofx(@date) == Bankjob.date_time_to_ofx(other.date) &&
    @raw_description == other.raw_description &&
    @amount == other.amount &&
    @type == other.type &&
    @new_balance == other.new_balance
end
|
375
|
+
|
376
|
+
#
|
377
|
+
# Overrides eql? so that array union will work when merging statements
|
378
|
+
#
|
379
|
+
def eql?(other) #:nodoc:
  # Defer to == so that array union treats equal transactions as duplicates
  # when statements are merged.
  self == other
end
|
382
|
+
|
383
|
+
##
|
384
|
+
# Overrides hash so that array union will work when merging statements
|
385
|
+
#
|
386
|
+
def hash() #:nodoc:
  # Combines the same fields that == compares (the value date is deliberately
  # left out) with the usual "31 * result + part" scheme, starting from 1.
  parts = [
    @amount.to_i,
    @new_balance.to_i,
    @date.nil? ? 0 : Bankjob.date_time_to_ofx(@date).hash,
    @raw_description.nil? ? 0 : @raw_description.hash,
    @type.nil? ? 0 : @type.hash
  ]
  parts.inject(1) { |result, part| 31 * result + part }
end
|
397
|
+
|
398
|
+
end # class Transaction
|
399
|
+
end # module
|
400
|
+
|
@@ -0,0 +1,133 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'mechanize'
|
3
|
+
require 'hpricot'
|
4
|
+
require 'bankjob'
|
5
|
+
|
6
|
+
# Later versions of Mechanize no longer use Hpricot by default
|
7
|
+
# but have an attribute we can set to use it
|
8
|
+
# Only versions of Mechanize that expose the html_parser setting need (and
# accept) this; older ones are left untouched. Checking respond_to? instead of
# rescuing NoMethodError avoids hiding an unrelated NoMethodError.
if WWW::Mechanize.respond_to?(:html_parser=)
  WWW::Mechanize.html_parser = Hpricot
end
|
12
|
+
|
13
|
+
include Bankjob
|
14
|
+
|
15
|
+
##
|
16
|
+
# BaseScraper is a specific example of a Bankjob Scraper that can be used as a base
|
17
|
+
# class for scrapers that follow a typical pattern.
|
18
|
+
#
|
19
|
+
# In fact, it does not add much functionality and you could just as readily subclass
|
20
|
+
# the Scraper class as this class, but here is what it does add:
|
21
|
+
# *+scraper_args+ attribute holds the array of args specified by the -scraper_args command line option
|
22
|
+
# *+scrape_statement+ is implemented to use the --input command line option to specify a file for input
|
23
|
+
# so that you can save a web-page to a file for debugging
|
24
|
+
# *+scrape_statement+ instantiates a Mechanize agent and delegates to two other
|
25
|
+
# simple methods that must be overridden in a subclass.
|
26
|
+
#
|
27
|
+
# Specifically +scrape_statement+ passes the Mechanize agent to +fetch_transactions_page+
|
28
|
+
# then passes the resulting page to +parse_transactions_page+. Subclasses must implement these two methods.
|
29
|
+
# See the documentation for these methods for more details on how to implement them.
|
30
|
+
# Note that failure to override either method will result in an exception.
|
31
|
+
#
|
32
|
+
class BaseScraper < Scraper
|
33
|
+
|
34
|
+
# +scraper_args+ holds the array of arguments specified on the command line with
|
35
|
+
# the -scraper_args option. It is not used here, but it is set in the scrape_statement
|
36
|
+
# method so that you can access it in your subclass.
|
37
|
+
attr_accessor :scraper_args
|
38
|
+
|
39
|
+
# This rule goes last and sets the type of any transactions
|
40
|
+
# that are still set to OTHER to be the generic CREDIT or DEBIT
|
41
|
+
# depending on the real amount of the transaction
|
42
|
+
# +priority+ set to -999 to ensure it's last
|
43
|
+
transaction_rule(-999) do |tx|
  # Only transactions whose type is still the generic OTHER are touched.
  next unless tx.type == Transaction::OTHER

  # The sign of the numeric amount decides between CREDIT and DEBIT.
  if tx.real_amount > 0
    tx.type = Transaction::CREDIT
  elsif tx.real_amount < 0
    tx.type = Transaction::DEBIT
  end
  # An amount of exactly zero leaves the type as OTHER.
end
|
53
|
+
|
54
|
+
##
|
55
|
+
# Override +fetch_transactions_page+ to use the mechanize +agent+ to
|
56
|
+
# load the page holding your bank statement on your online banking website.
|
57
|
+
# By using agent.get(url) to fetch the page, the returned page will be
|
58
|
+
# an Hpricot document ready for parsing.
|
59
|
+
#
|
60
|
+
# Typically you will need to log-in using a form on a login page first.
|
61
|
+
# Your implementation may look something like this:
|
62
|
+
#
|
63
|
+
# # My online banking app has a logon page with a standard HTML form.
|
64
|
+
# # by looking at the source of the page I see that the form is named
|
65
|
+
# # 'MyLoginFormName' and the two text fields for user name and password
|
66
|
+
# # are called 'USERNAME' and 'PASSWORD' respectively.
|
67
|
+
# login_page = agent.get("http://mybankapp.com/login.html")
|
68
|
+
# form = login_page.forms.name('MyLoginFormName').first
|
69
|
+
# # Mechanize automatically makes constants for the form elements based on their names.
|
70
|
+
# form.USERNAME = "me"
|
71
|
+
# form.PASSWORD = "foo"
|
72
|
+
# agent.submit(form)
|
73
|
+
# sleep 3 #wait while the login takes effect
|
74
|
+
#
|
75
|
+
# # Now that I've logged in and waited a bit, navigate to the page that lists
|
76
|
+
# # my recent transactions and return it
|
77
|
+
# return agent.get("http://mybankapp.com/latesttransactions.html")
|
78
|
+
#
|
79
|
+
def fetch_transactions_page(agent)
  # Placeholder: always raises to tell the implementer that a subclass has to
  # provide this method. +agent+ is not used here.
  message = "You must override fetch_transactions_page in your subclass of BaseScraper " \
            "or just subclass Scraper instead and override scrape_statement"
  raise RuntimeError, message
end
|
83
|
+
|
84
|
+
##
|
85
|
+
# Override +parse_transactions_page+ to take the Hpricot document passed in
|
86
|
+
# as +page+, parse it using Hpricot directives, and create a Statement object
|
87
|
+
# holding a set of Transaction objects for it.
|
88
|
+
#
|
89
|
+
def parse_transactions_page(page)
  # Placeholder: always raises to tell the implementer that a subclass has to
  # provide this method. +page+ is not used here.
  message = "You must override parse_transactions_page in your subclass of BaseScraper " \
            "or just subclass Scraper instead and override scrape_statement"
  raise RuntimeError, message
end
|
93
|
+
|
94
|
+
##
|
95
|
+
# Implements the one essential method of a scraper +scrape_statement+
|
96
|
+
# by calling +fetch_transactions_page+ to get a web page holding a bank
|
97
|
+
# statement followed by a call to +parse_transactions_page+ that returns
|
98
|
+
# the +Statement+ object.
|
99
|
+
#
|
100
|
+
# Do not override this method in a subclass. (If you want to override it
|
101
|
+
# you should be subclassing Scraper instead of this class)
|
102
|
+
#
|
103
|
+
# If the --input argument has been used to specify an input html file to
|
104
|
+
# use, this will be parsed directly instead of calling +fetch_transactions_page+.
|
105
|
+
# This allows for easy debugging without slow web-scraping (simply view
|
106
|
+
# the page in a regular browser and use Save Page As to save a local copy
|
107
|
+
# of it, then specify this with the --input command-line arg)
|
108
|
+
#
|
109
|
+
# +args+ holds the array of arguments specified on the command line with
|
110
|
+
# the -scraper_args option. It is not used here, but it is set on an
|
111
|
+
# attribute called scraper_args and is thus accessible in your subclass.
|
112
|
+
#
|
113
|
+
def scrape_statement(args)
  # Loads the transactions page and returns whatever parse_transactions_page
  # makes of it.
  #
  # +args+ is stored in scraper_args so that subclasses can read it; it is not
  # used here.
  #
  # Raises a RuntimeError if no page could be loaded.
  self.scraper_args = args

  if options.input.nil?
    # Not debugging: use the actual scraper.
    # First create a mechanize agent: a sort of pretend web browser.
    agent = WWW::Mechanize.new
    agent.user_agent_alias = 'Windows IE 6' # pretend that we're IE 6.0

    page = fetch_transactions_page(agent)
  else
    # Used for debugging - load the page from a file instead of the web.
    logger.debug("Reading debug input html from #{options.input} instead of scraping the real website.")
    # File.open with a block closes the file once it has been parsed (the
    # handle used to be left open) and, unlike Kernel#open, never treats a
    # path starting with "|" as a command to run.
    page = File.open(options.input) { |html| Hpricot(html) }
  end

  raise "BaseScraper failed to load the transactions page" if page.nil?

  # Now that we've fetched the page, parse it to get a statement.
  parse_transactions_page(page)
end
|
132
|
+
end # BaseScraper
|
133
|
+
|