pidgin2adium 3.0.1 → 3.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,125 @@
1
+ # HtmlLogParser class, a subclass of BasicParser.
2
+ # Used for parse()ing HTML logs.
3
+
4
+ require 'balance_tags_c'
5
+
6
+ module Pidgin2Adium
7
+ class HtmlLogParser < BasicParser
8
+ def initialize(src_path, user_aliases)
9
+ super(src_path, user_aliases)
10
+ @timestamp_rx = '\(((?:\d{4}-\d{2}-\d{2} )?\d{1,2}:\d{1,2}:\d{1,2}(?: [AP]M)?)\)'
11
+
12
+ # @line_regex matches a line in an HTML log file other than the
13
+ # first time matches on either "2008-11-17 14:12" or "14:12"
14
+ # @line_regex match obj:
15
+ # 0: timestamp, extended or not
16
+ # 1: screen name or alias, if alias set
17
+ # 2: "&lt;AUTO-REPLY&gt;" or nil
18
+ # 3: message body
19
+ # The ":" is optional to allow for strings like "(17:12:21) <b>***Gabe B-W</b> is confused<br/>"
20
+ @line_regex = /#{@timestamp_rx} ?<b>(.+?) ?(&lt;AUTO-REPLY&gt;)?:?<\/b> ?(.+)<br ?\/>/o
21
+ # @line_regex_status matches a status line
22
+ # @line_regex_status match obj:
23
+ # 0: timestamp
24
+ # 1: status message
25
+ @line_regex_status = /#{@timestamp_rx} ?<b> (.+)<\/b><br ?\/>/o
26
+ end
27
+
28
+ # Returns a cleaned string.
29
+ # Removes the following tags from _text_:
30
+ # * html
31
+ # * body
32
+ # * font
33
+ # * a with no innertext, e.g. <a href="blah"></a>
34
+ # And removes the following style declarations:
35
+ # * color: #000000 (just turns text black)
36
+ # * font-family
37
+ # * font-size
38
+ # * background
39
+ # * em (really it's changed to <span style="font-style: italic;">)
40
+ # Since each <span> has only one style declaration, spans with these
41
+ # declarations are removed (but the text inside them is preserved).
42
+ def cleanup(text)
43
+ # Sometimes this is in there. I don't know why.
44
+ text.gsub!(%r{&lt;/FONT HSPACE='\d'>}, '')
45
+ # We can remove <font> safely since Pidgin and Adium both show bold
46
+ # using <span style="font-weight: bold;"> except Pidgin uses single
47
+ # quotes while Adium uses double quotes.
48
+ text.gsub!(/<\/?(?:html|body|font)(?: .+?)?>/, '') # very important!
49
+
50
+ text.tr!("\r", '')
51
+ # Remove empty lines
52
+ text.gsub!("\n\n", "\n")
53
+
54
+ # Remove newlines that end the file, since they screw up the
55
+ # newline -> <br/> conversion
56
+ text.gsub!(/\n\Z/, '')
57
+
58
+ # Replace newlines with "<br/>" unless they end a chat line.
59
+ # This must go after we remove <font> tags.
60
+ text.gsub!(/\n(?!#{@timestamp_rx})/, '<br/>')
61
+
62
+ # These empty links are sometimes appended to every line in a chat,
63
+ # for some weird reason. Remove them.
64
+ text.gsub!(%r{<a href=['"].+?['"]>\s*?</a>}, '')
65
+
66
+ # Replace single quotes inside tags with double quotes so we can
67
+ # easily change single quotes to entities.
68
+ # For spans, removes a space after the final declaration if it exists.
69
+ text.gsub!(/<span style='([^']+?;) ?'>/, '<span style="\1">')
70
+ text.gsub!(/([a-z]+=)'(.+?)'/, '\1"\2"')
71
+ =begin
72
+ text.gsub!(/<a href='(.+?)'>/, '<a href="\1">')
73
+ text.gsub!(/<img src='([^']+?)'/, '<img src="\1"')
74
+ text.gsub!(/ alt='([^']+?)'/, ' alt="\1"')
75
+ =end
76
+ text.gsub!("'", '&apos;')
77
+
78
+ # This actually does match stuff, but doesn't group it correctly. :(
79
+ # text.gsub!(%r{<span style="((?:.+?;)+)">(.*?)</span>}) do |s|
80
+ text.gsub!(%r{<span style="(.+?)">(.*?)</span>}) do |s|
81
+ # Remove empty spans.
82
+ next if $2 == ''
83
+
84
+ # style = style declaration
85
+ # innertext = text inside <span>
86
+ style, innertext = $1, $2
87
+ # TODO: replace double quotes with "&quot;", but only outside tags; may still be tags inside spans
88
+ # innertext.gsub!("")
89
+
90
+ styleparts = style.split(/; ?/)
91
+ styleparts.map! do |p|
92
+ if p[0,5] == 'color'
93
+ if p.include?('color: #000000')
94
+ next
95
+ elsif p =~ /(color: #[0-9a-fA-F]{6})(>.*)?/
96
+ # Regarding the bit with the ">", sometimes this happens:
97
+ # <span style="color: #000000>today;">today was busy</span>
98
+ # Then p = "color: #000000>today"
99
+ # Or it can end in ">;", with no text before the semicolon.
100
+ # So keep the color but remove the ">" and anything following it.
101
+ next($1)
102
+ end
103
+ else
104
+ # don't remove font-weight
105
+ case p
106
+ when /^font-family/ then next
107
+ when /^font-size/ then next
108
+ when /^background/ then next
109
+ end
110
+ end
111
+ end.compact!
112
+ unless styleparts.empty?
113
+ style = styleparts.join('; ')
114
+ innertext = "<span style=\"#{style};\">#{innertext}</span>"
115
+ end
116
+ innertext
117
+ end
118
+ # Pidgin uses <em>, Adium uses <span>
119
+ if text.gsub!('<em>', '<span style="font-style: italic;">')
120
+ text.gsub!('</em>', '</span>')
121
+ end
122
+ return text
123
+ end
124
+ end # END HtmlLogParser class
125
+ end
@@ -20,8 +20,12 @@ module Pidgin2Adium
20
20
  @my_aliases = aliases
21
21
 
22
22
  unless File.directory?(@pidgin_log_dir)
23
- puts "Source directory #{@pidgin_log_dir} does not exist or is not a directory."
24
- raise Errno::ENOENT
23
+ msg = "Source directory #{@pidgin_log_dir} does not exist or is not a directory."
24
+ error(msg)
25
+
26
+ # ENOENT automatically prepends "No such file or directory - " to
27
+ # its initializer's arguments
28
+ raise Errno::ENOENT.new("source directory #{@pidgin_log_dir}")
25
29
  end
26
30
  end
27
31
 
@@ -31,11 +35,11 @@ module Pidgin2Adium
31
35
  def start
32
36
  log_msg "Begin converting."
33
37
  begin
34
- files_path = get_all_chat_files(@pidgin_log_dir)
38
+ files_path = get_all_chat_files()
35
39
  rescue Errno::EACCES => bang
36
40
  error("Sorry, permission denied for getting Pidgin chat files from #{@pidgin_log_dir}.")
37
41
  error("Details: #{bang.message}")
38
- raise Errno::EACCES
42
+ raise bang
39
43
  end
40
44
 
41
45
  total_files = files_path.size
@@ -52,21 +56,16 @@ module Pidgin2Adium
52
56
 
53
57
  delete_search_indexes()
54
58
 
55
- log_msg "Finished converting! Converted #{total_successes} files of #{total_files} total."
59
+ Pidgin2Adium.log_msg "Finished converting! Converted #{total_successes} files of #{total_files} total."
56
60
  puts "Minor error messages:"
57
61
  puts @@oops_messages.join("\n")
58
62
  puts "Major error messages:"
59
63
  puts @@error_messages.join("\n")
60
64
  end
61
65
 
62
- ###########
63
- private
64
- ###########
65
-
66
- def get_all_chat_files(dir)
67
- return [] if File.basename(dir) == ".system"
66
+ def get_all_chat_files
68
67
  # recurse into each subdir
69
- return (Dir.glob("#{@pidgin_log_dir}/**/*.{htm,html,txt}") - BAD_DIRS)
68
+ Dir.glob("#{@pidgin_log_dir}/**/*.{htm,html,txt}") - BAD_DIRS
70
69
  end
71
- end # END LogConverter class
70
+ end
72
71
  end
@@ -98,5 +98,5 @@ module Pidgin2Adium
98
98
 
99
99
  return output_path
100
100
  end
101
- end # END LogFile class
101
+ end
102
102
  end
@@ -1,618 +1,3 @@
1
- # Contains the class BasicParser and its subclasses, HtmlLogParser and
2
- # TextFileParser, which parse the file passed into it and return a LogFile
3
- # object.
4
- #
5
- # Please use Pidgin2Adium.parse or Pidgin2Adium.parse_and_generate instead of
6
- # using these classes directly.
7
- require 'parsedate'
8
- require 'time' # for Time.zone_offset
9
-
10
- require 'balance_tags_c'
11
- require 'pidgin2adium/log_file'
12
-
13
- module Pidgin2Adium
14
- # Empty class. Raise'd by LogParser if the first line of a log is not
15
- # parseable.
16
- class InvalidFirstLineError < StandardError; end
17
-
18
- # BasicParser is a base class. Its subclasses are TextLogParser and
19
- # HtmlLogParser.
20
- #
21
- # Please use Pidgin2Adium.parse or Pidgin2Adium.parse_and_generate instead of
22
- # using this class directly.
23
- class BasicParser
24
- include Pidgin2Adium
25
- def initialize(src_path, user_aliases)
26
- @src_path = src_path
27
- # Whitespace is removed for easy matching later on.
28
- @user_aliases = user_aliases.split(',').map!{|x| x.downcase.gsub(/\s+/,'') }.uniq
29
- # @user_alias is set each time get_sender_by_alias is called. It is a non-normalized
30
- # alias.
31
- # Set an initial value just in case the first message doesn't give
32
- # us an alias.
33
- @user_alias = user_aliases.split(',')[0]
34
-
35
- @tz_offset = get_time_zone_offset()
36
-
37
- file = File.new(@src_path, 'r')
38
- @first_line = file.readline
39
- @file_content = file.read
40
- file.close
41
-
42
- # Time regexes must be set before pre_parse().
43
- # "4/18/2007 11:02:00 AM" => %w{4, 18, 2007, 11, 02, 00, AM}
44
- # ONLY used (if at all) in first line of chat ("Conversation with...at...")
45
- @time_regex_first_line = %r{^(\d{1,2})/(\d{1,2})/(\d{4}) (\d{1,2}):(\d{2}):(\d{2}) ([AP]M)$}
46
- # "2007-04-17 12:33:13" => %w{2007, 04, 17, 12, 33, 13}
47
- @time_regex = /^(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})$/
48
- # sometimes a line in a chat doesn't have a full timestamp
49
- # "04:22:05 AM" => %w{04 22 05 AM}
50
- @minimal_time_regex = /^(\d{1,2}):(\d{2}):(\d{2})( [AP]M)?$/
51
-
52
- # Whether or not the first line is parseable.
53
- @first_line_is_valid = true
54
- begin
55
- @service,
56
- @user_SN,
57
- @partner_SN,
58
- # @basic_time_info is for files that only have the full
59
- # timestamp at the top; we can use it to fill in the minimal
60
- # per-line timestamps. It has only 3 elements (year, month,
61
- # dayofmonth) because you should be able to fill everything
62
- # else in. If you can't, something's wrong.
63
- @basic_time_info,
64
- # When the chat started, in Adium's format
65
- @adium_chat_time_start = pre_parse()
66
- rescue InvalidFirstLineError
67
- @first_line_is_valid = false
68
- error("Failed to parse, invalid first line: #{@src_path}")
69
- return # stop processing
70
- end
71
-
72
- # @status_map, @lib_purple_events, and @events are used in
73
- # create_status_or_event_msg
74
- @status_map = {
75
- /(.+) logged in\.$/ => 'online',
76
- /(.+) logged out\.$/ => 'offline',
77
- /(.+) has signed on\.$/ => 'online',
78
- /(.+) has signed off\.$/ => 'offline',
79
- /(.+) has gone away\.$/ => 'away',
80
- /(.+) is no longer away\.$/ => 'available',
81
- /(.+) has become idle\.$/ => 'idle',
82
- /(.+) is no longer idle\.$/ => 'available'
83
- }
84
-
85
- # lib_purple_events are all of event_type libPurple
86
- @lib_purple_events = [
87
- # file transfer
88
- /Starting transfer of .+ from (.+)/,
89
- /^Offering to send .+ to (.+)$/,
90
- /(.+) is offering to send file/,
91
- /^Transfer of file .+ complete$/,
92
- /Error reading|writing|accessing .+: .+/,
93
- /You cancell?ed the transfer of/,
94
- /File transfer cancelled/,
95
- /(.+?) cancell?ed the transfer of/,
96
- /(.+?) cancelled the file transfer/,
97
- # Direct IM - actual (dis)connect events are their own types
98
- /^Attempting to connect to (.+) at .+ for Direct IM\./,
99
- /^Asking (.+) to connect to us at .+ for Direct IM\./,
100
- /^Attempting to connect via proxy server\.$/,
101
- /^Direct IM with (.+) failed/,
102
- # encryption
103
- /Received message encrypted with wrong key/,
104
- /^Requesting key\.\.\.$/,
105
- /^Outgoing message lost\.$/,
106
- /^Conflicting Key Received!$/,
107
- /^Error in decryption- asking for resend\.\.\.$/,
108
- /^Making new key pair\.\.\.$/,
109
- # sending errors
110
- /^Last outgoing message not received properly- resetting$/,
111
- /Resending\.\.\./,
112
- # connection errors
113
- /Lost connection with the remote user:.+/,
114
- # chats
115
- /^.+ entered the room\.$/,
116
- /^.+ left the room\.$/
117
- ]
118
-
119
- # non-libpurple events
120
- # Each key maps to an event_type string. The keys will be matched against a line of chat
121
- # and the partner's alias will be in regex group 1, IF the alias is matched.
122
- @event_map = {
123
- # .+ is not an alias, it's a proxy server so no grouping
124
- /^Attempting to connect to .+\.$/ => 'direct-im-connect',
125
- # NB: pidgin doesn't track when Direct IM is disconnected, AFAIK
126
- /^Direct IM established$/ => 'directIMConnected',
127
- /Unable to send message/ => 'chat-error',
128
- /You missed .+ messages from (.+) because they were too large/ => 'chat-error',
129
- /User information not available/ => 'chat-error'
130
- }
131
-
132
- @ignore_events = [
133
- # Adium ignores SN/alias changes.
134
- /^.+? is now known as .+?\.<br\/?>$/
135
- ]
136
- end
137
-
138
- # This method returns a LogFile instance, or false if an error occurred.
139
- def parse
140
- return false unless @first_line_is_valid
141
- @file_content = cleanup(@file_content).split("\n")
142
-
143
- @file_content.map! do |line|
144
- # "next" returns nil which is removed by compact
145
- next if line =~ /^\s+$/
146
- if line =~ @line_regex
147
- create_msg($~.captures)
148
- elsif line =~ @line_regex_status
149
- msg = create_status_or_event_msg($~.captures)
150
- # Error occurred while parsing
151
- return false if msg == false
152
- else
153
- error "Could not parse line:"
154
- p line
155
- return false
156
- end
157
- end
158
- @file_content.compact!
159
- return LogFile.new(@file_content, @service, @user_SN, @partner_SN, @adium_chat_time_start)
160
- end
161
- # Prevent parse from being called directly from BasicParser, since
162
- # it uses subclassing magic.
163
- protected :parse
164
-
165
- #################
166
- private
167
- #################
168
-
169
- def get_time_zone_offset()
170
- # We must have a tz_offset or else the Adium Chat Log viewer
171
- # doesn't read the date correctly and then:
172
- # 1) the log has an empty start date column in the viewer
173
- # 2) The timestamps are all the same for the whole log
174
- tz_match = /([-\+]\d+)[A-Z]{3}\.(?:txt|htm|html)/.match(@src_path)
175
- if tz_match and tz_match[1]
176
- tz_offset = tz_match[1]
177
- else
178
- # "-0500" (3d rather than 2d to allow for "+")
179
- tz_offset = sprintf('%+03d00', Time.zone_offset(Time.now.zone) / 3600)
180
- end
181
- return tz_offset
182
- end
183
-
184
- #--
185
- # Adium time format: YYYY-MM-DD\THH:MM:SS[+-]TZ_HRS like:
186
- # 2008-10-05T22:26:20-0800
187
- # HOWEVER:
188
- # If it's the first line, then return it like this (note periods):
189
- # 2008-10-05T22.26.20-0800
190
- # because it will be used in the filename.
191
- #++
192
- # Converts a pidgin datestamp to an Adium one.
193
- def create_adium_time(time, is_first_line = false)
194
- # parsed_date = [year, month, day, hour, min, sec]
195
- if time =~ @time_regex
196
- year, month, day, hour, min, sec = $1.to_i,
197
- $2.to_i,
198
- $3.to_i,
199
- $4.to_i,
200
- $5.to_i,
201
- $6.to_i
202
- elsif is_first_line and time =~ @time_regex_first_line
203
- hour = $4.to_i
204
- if $7 == 'PM' and hour != 12
205
- hour += 12
206
- end
207
- year, month, day, min, sec = $3.to_i, # year
208
- $1.to_i, # month
209
- $2.to_i, # day
210
- # already did hour
211
- $5.to_i, # minutes
212
- $6.to_i # seconds
213
- elsif time =~ @minimal_time_regex
214
- # "04:22:05" => %w{04 22 05}
215
- hour = $1.to_i
216
- if $4 == 'PM' and hour != 12
217
- hour += 12
218
- end
219
- year, month, day = @basic_time_info
220
- min = $2.to_i
221
- sec = $3.to_i
222
- else
223
- error("You have found an odd timestamp. Please report it to the developer.")
224
- log_msg("The timestamp: #{time}")
225
- log_msg("Continuing...")
226
- year,month,day,hour,min,sec = ParseDate.parsedate(time)
227
- end
228
- if is_first_line
229
- adium_time = Time.local(year,month,day,hour,min,sec).strftime("%Y-%m-%dT%H.%M.%S#{@tz_offset}")
230
- else
231
- adium_time = Time.local(year,month,day,hour,min,sec).strftime("%Y-%m-%dT%H:%M:%S#{@tz_offset}")
232
- end
233
- return adium_time
234
- end
235
-
236
- # Extract required data from the file. Run by parse.
237
- def pre_parse
238
- # Deal with first line.
239
-
240
- # the first line is special. It tells us (in order of regex groups):
241
- # 1) who we're talking to
242
- # 2) what time/date
243
- # 3) what SN we used
244
- # 4) what protocol (AIM, icq, jabber...)
245
- first_line_match = /Conversation with (.+?) at (.+?) on (.+?) \((.+?)\)/.match(@first_line)
246
- if first_line_match.nil?
247
- raise InvalidFirstLineError
248
- else
249
- service = first_line_match[4]
250
- # @user_SN is normalized to avoid "AIM.name" and "AIM.na me" folders
251
- user_SN = first_line_match[3].downcase.tr(' ', '')
252
- partner_SN = first_line_match[1]
253
- pidgin_chat_time_start = first_line_match[2]
254
- basic_time_info = case pidgin_chat_time_start
255
- when @time_regex then [$1.to_i, $2.to_i, $3.to_i]
256
- when @time_regex_first_line then [$3.to_i, $1.to_i, $2.to_i]
257
- end
258
- adium_chat_time_start = create_adium_time(pidgin_chat_time_start, true)
259
- return [service,
260
- user_SN,
261
- partner_SN,
262
- basic_time_info,
263
- adium_chat_time_start]
264
- end
265
- end
266
-
267
- def get_sender_by_alias(alias_name)
268
- no_action = alias_name.sub(/^\*{3}/, '')
269
- if @user_aliases.include? no_action.downcase.gsub(/\s+/, '')
270
- # Set the current alias being used of the ones in @user_aliases
271
- @user_alias = no_action
272
- return @user_SN
273
- else
274
- return @partner_SN
275
- end
276
- end
277
-
278
- #--
279
- # create_msg takes an array of captures from matching against
280
- # @line_regex and returns a Message object or one of its subclasses.
281
- # It can be used for TextLogParser and HtmlLogParser because both of
282
- # them return data in the same indexes in the matches array.
283
- #++
284
- def create_msg(matches)
285
- msg = nil
286
- # Either a regular message line or an auto-reply/away message.
287
- time = create_adium_time(matches[0])
288
- buddy_alias = matches[1]
289
- sender = get_sender_by_alias(buddy_alias)
290
- body = matches[3]
291
- if matches[2] # auto-reply
292
- msg = AutoReplyMessage.new(sender, time, buddy_alias, body)
293
- else
294
- # normal message
295
- msg = XMLMessage.new(sender, time, buddy_alias, body)
296
- end
297
- return msg
298
- end
299
-
300
- #--
301
- # create_status_or_event_msg takes an array of +MatchData+ captures from
302
- # matching against @line_regex_status and returns an Event or Status.
303
- # Returns nil if it's a message that should be ignored, or false if an
304
- # error occurred.
305
- #++
306
- def create_status_or_event_msg(matches)
307
- # ["22:58:00", "BuddyName logged in."]
308
- # 0: time
309
- # 1: status message or event
310
- msg = nil
311
- time = create_adium_time(matches[0])
312
- str = matches[1]
313
- # Return nil, which will get compact'ed out
314
- return nil if @ignore_events.detect{|regex| str =~ regex }
315
-
316
- regex, status = @status_map.detect{|regex, status| str =~ regex}
317
- if regex and status
318
- # Status message
319
- buddy_alias = regex.match(str)[1]
320
- sender = get_sender_by_alias(buddy_alias)
321
- msg = StatusMessage.new(sender, time, buddy_alias, status)
322
- else
323
- # Test for event
324
- regex = @lib_purple_events.detect{|regex| str =~ regex }
325
- event_type = 'libpurpleEvent' if regex
326
- unless regex and event_type
327
- # not a libpurple event, try others
328
- if @event_map.detect{|regex,event_type| str =~ regex}
329
- regex, event_type = $1, $2
330
- else
331
- error(sprintf("Error parsing status or event message, no status or event found: %p", str))
332
- return false
333
- end
334
- end
335
- if regex and event_type
336
- regex_matches = regex.match(str)
337
- # Event message
338
- if regex_matches.size == 1
339
- # No alias - this means it's the user
340
- buddy_alias = @user_alias
341
- sender = @user_SN
342
- else
343
- buddy_alias = regex_matches[1]
344
- sender = get_sender_by_alias(buddy_alias)
345
- end
346
- msg = Event.new(sender, time, buddy_alias, str, event_type)
347
- end
348
- end
349
- return msg
350
- end
351
- end # END BasicParser class
352
-
353
- # Please use Pidgin2Adium.parse or Pidgin2Adium.parse_and_generate instead of
354
- # using this class directly.
355
- class TextLogParser < BasicParser
356
- def initialize(src_path, user_aliases)
357
- super(src_path, user_aliases)
358
- @timestamp_rx = '\((\d{1,2}:\d{1,2}:\d{1,2})\)'
359
-
360
- # @line_regex matches a line in a TXT log file other than the first
361
- # @line_regex matchdata:
362
- # 0: timestamp
363
- # 1: screen name or alias, if alias set
364
- # 2: "<AUTO-REPLY>" or nil
365
- # 3: message body
366
- @line_regex = /#{@timestamp_rx} (.*?) ?(<AUTO-REPLY>)?: (.*)/o
367
-
368
- # @line_regex_status matches a status line
369
- # @line_regex_status matchdata:
370
- # 0: timestamp
371
- # 1: status message
372
- @line_regex_status = /#{@timestamp_rx} ([^:]+)/o
373
- end
374
-
375
- public :parse
376
-
377
- #################
378
- private
379
- #################
380
-
381
- def cleanup(text)
382
- text.tr!("\r", '')
383
- # Replace newlines with "<br/>" unless they end a chat line.
384
- text.gsub!(/\n(?!#{@timestamp_rx}|\Z)/, '<br/>')
385
- # Escape entities since this will be in XML
386
- text.gsub!('&', '&amp;') # escape '&' first
387
- text.gsub!('<', '&lt;')
388
- text.gsub!('>', '&gt;')
389
- text.gsub!('"', '&quot;')
390
- text.gsub!("'", '&apos;')
391
- return text
392
- end
393
- end
394
-
395
- # Please use Pidgin2Adium.parse or Pidgin2Adium.parse_and_generate instead
396
- # of using this class directly.
397
- class HtmlLogParser < BasicParser
398
- def initialize(src_path, user_aliases)
399
- super(src_path, user_aliases)
400
- @timestamp_rx = '\(((?:\d{4}-\d{2}-\d{2} )?\d{1,2}:\d{1,2}:\d{1,2}(?: [AP]M)?)\)'
401
-
402
- # @line_regex matches a line in an HTML log file other than the
403
- # first time matches on either "2008-11-17 14:12" or "14:12"
404
- # @line_regex match obj:
405
- # 0: timestamp, extended or not
406
- # 1: screen name or alias, if alias set
407
- # 2: "&lt;AUTO-REPLY&gt;" or nil
408
- # 3: message body
409
- # The ":" is optional to allow for strings like "(17:12:21) <b>***Gabe B-W</b> is confused<br/>"
410
- @line_regex = /#{@timestamp_rx} ?<b>(.+?) ?(&lt;AUTO-REPLY&gt;)?:?<\/b> ?(.+)<br ?\/>/o
411
- # @line_regex_status matches a status line
412
- # @line_regex_status match obj:
413
- # 0: timestamp
414
- # 1: status message
415
- @line_regex_status = /#{@timestamp_rx} ?<b> (.+)<\/b><br ?\/>/o
416
- end
417
-
418
- public :parse
419
-
420
- #################
421
- private
422
- #################
423
-
424
- # Returns a cleaned string.
425
- # Removes the following tags from _text_:
426
- # * html
427
- # * body
428
- # * font
429
- # * a with no innertext, e.g. <a href="blah"></a>
430
- # And removes the following style declarations:
431
- # * color: #000000 (just turns text black)
432
- # * font-family
433
- # * font-size
434
- # * background
435
- # * em (really it's changed to <span style="font-style: italic;">)
436
- # Since each <span> has only one style declaration, spans with these
437
- # declarations are removed (but the text inside them is preserved).
438
- def cleanup(text)
439
- # Sometimes this is in there. I don't know why.
440
- text.gsub!(%r{&lt;/FONT HSPACE='\d'>}, '')
441
- # We can remove <font> safely since Pidgin and Adium both show bold
442
- # using <span style="font-weight: bold;"> except Pidgin uses single
443
- # quotes while Adium uses double quotes.
444
- text.gsub!(/<\/?(?:html|body|font)(?: .+?)?>/, '') # very important!
445
-
446
- text.tr!("\r", '')
447
- # Remove empty lines
448
- text.gsub!("\n\n", "\n")
449
-
450
- # Remove newlines that end the file, since they screw up the
451
- # newline -> <br/> conversion
452
- text.gsub!(/\n\Z/, '')
453
-
454
- # Replace newlines with "<br/>" unless they end a chat line.
455
- # This must go after we remove <font> tags.
456
- text.gsub!(/\n(?!#{@timestamp_rx})/, '<br/>')
457
-
458
- # These empty links are sometimes appended to every line in a chat,
459
- # for some weird reason. Remove them.
460
- text.gsub!(%r{<a href=('").+?\1>\s*?</a>}, '')
461
-
462
- # Replace single quotes inside tags with double quotes so we can
463
- # easily change single quotes to entities.
464
- # For spans, removes a space after the final declaration if it exists.
465
- text.gsub!(/<span style='([^']+?;) ?'>/, '<span style="\1">')
466
- text.gsub!(/([a-z]+=)'(.+?)'/, '\1"\2"')
467
- =begin
468
- text.gsub!(/<a href='(.+?)'>/, '<a href="\1">')
469
- text.gsub!(/<img src='([^']+?)'/, '<img src="\1"')
470
- text.gsub!(/ alt='([^']+?)'/, ' alt="\1"')
471
- =end
472
- text.gsub!("'", '&apos;')
473
-
474
- # This actually does match stuff, but doesn't group it correctly. :(
475
- # text.gsub!(%r{<span style="((?:.+?;)+)">(.*?)</span>}) do |s|
476
- text.gsub!(%r{<span style="(.+?)">(.*?)</span>}) do |s|
477
- # Remove empty spans.
478
- next if $2 == ''
479
-
480
- # style = style declaration
481
- # innertext = text inside <span>
482
- style, innertext = $1, $2
483
- # TODO: replace double quotes with "&quot;", but only outside tags; may still be tags inside spans
484
- # innertext.gsub!("")
485
-
486
- styleparts = style.split(/; ?/)
487
- styleparts.map! do |p|
488
- if p[0,5] == 'color'
489
- if p.include?('color: #000000')
490
- next
491
- elsif p =~ /(color: #[0-9a-fA-F]{6})(>.*)?/
492
- # Regarding the bit with the ">", sometimes this happens:
493
- # <span style="color: #000000>today;">today was busy</span>
494
- # Then p = "color: #000000>today"
495
- # Or it can end in ">;", with no text before the semicolon.
496
- # So keep the color but remove the ">" and anything following it.
497
- next($1)
498
- end
499
- else
500
- # don't remove font-weight
501
- case p
502
- when /^font-family/ then next
503
- when /^font-size/ then next
504
- when /^background/ then next
505
- end
506
- end
507
- end.compact!
508
- unless styleparts.empty?
509
- style = styleparts.join('; ')
510
- innertext = "<span style=\"#{style};\">#{innertext}</span>"
511
- end
512
- innertext
513
- end
514
- # Pidgin uses <em>, Adium uses <span>
515
- if text.gsub!('<em>', '<span style="font-style: italic;">')
516
- text.gsub!('</em>', '</span>')
517
- end
518
- return text
519
- end
520
- end # END HtmlLogParser class
521
-
522
- # A holding object for each line of the chat. It is subclassed as
523
- # appropriate (eg AutoReplyMessage). Each subclass (but not Message
524
- # itself) has its own to_s which prints out its information in a format
525
- # appropriate for putting in an Adium log file.
526
- # Subclasses: XMLMessage, AutoReplyMessage, StatusMessage, Event.
527
- class Message
528
- def initialize(sender, time, buddy_alias)
529
- # The sender's screen name
530
- @sender = sender
531
- # The time the message was sent, in Adium format (e.g.
532
- # "2008-10-05T22:26:20-0800")
533
- @time = time
534
- # The receiver's alias (NOT screen name)
535
- @buddy_alias = buddy_alias
536
- end
537
- attr_accessor :sender, :time, :buddy_alias
538
- end
539
-
540
- # Basic message with body text (as opposed to pure status messages, which
541
- # have no body).
542
- class XMLMessage < Message
543
- def initialize(sender, time, buddy_alias, body)
544
- super(sender, time, buddy_alias)
545
- @body = body
546
- @styled_body = '<div><span style="font-family: Helvetica; font-size: 12pt;">%s</span></div>' % @body
547
- normalize_body!()
548
- end
549
- attr_accessor :body
550
-
551
- def to_s
552
- return sprintf('<message sender="%s" time="%s" alias="%s">%s</message>' << "\n",
553
- @sender, @time, @buddy_alias, @styled_body)
554
- end
555
-
556
- #################
557
- private
558
- #################
559
-
560
- # Balances mismatched tags, normalizes body style, and fixes actions
561
- # so they are in Adium style (Pidgin uses "***Buddy waves at you", Adium uses
562
- # "*Buddy waves at you*").
563
- def normalize_body!
564
- normalize_body_entities!()
565
- # Fix mismatched tags. Yes, it's faster to do it per-message
566
- # than all at once.
567
- @body = Pidgin2Adium.balance_tags_c(@body)
568
- if @buddy_alias[0,3] == '***'
569
- # "***<alias>" is what pidgin sets as the alias for a /me action
570
- @buddy_alias.slice!(0,3)
571
- @body = '*' << @body << '*'
572
- end
573
- end
574
-
575
- # Escapes entities.
576
- def normalize_body_entities!
577
- # Convert '&' to '&amp;' only if it's not followed by an entity.
578
- @body.gsub!(/&(?!lt|gt|amp|quot|apos)/, '&amp;')
579
- end
580
- end # END XMLMessage
581
-
582
- # An auto reply message.
583
- class AutoReplyMessage < XMLMessage
584
- def to_s
585
- return sprintf('<message sender="%s" time="%s" auto="true" alias="%s">%s</message>' << "\n",
586
- @sender, @time, @buddy_alias, @styled_body)
587
- end
588
- end
589
-
590
- # A message saying e.g. "Blahblah has gone away."
591
- class StatusMessage < Message
592
- def initialize(sender, time, buddy_alias, status)
593
- super(sender, time, buddy_alias)
594
- @status = status
595
- end
596
- attr_accessor :status
597
-
598
- def to_s
599
- return sprintf('<status type="%s" sender="%s" time="%s" alias="%s"/>' << "\n", @status, @sender, @time, @buddy_alias)
600
- end
601
- end
602
-
603
- # Pidgin does not have Events, but Adium does. Pidgin mostly uses system
604
- # messages to display what Adium calls events. These include sending a file,
605
- # starting a Direct IM connection, or an error in chat.
606
- class Event < XMLMessage
607
- def initialize(sender, time, buddy_alias, body, event_type)
608
- super(sender, time, buddy_alias, body)
609
- @event_type = event_type
610
- end
611
- attr_accessor :event_type
612
-
613
- def to_s
614
- return sprintf('<event type="%s" sender="%s" time="%s" alias="%s">%s</event>',
615
- @event_type, @sender, @time, @buddy_alias, @styled_body)
616
- end
617
- end
618
- end # end module
1
+ require 'pidgin2adium/basic_parser'
2
+ require 'pidgin2adium/text_log_parser'
3
+ require 'pidgin2adium/html_log_parser'