pidgin2adium 3.0.1 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,125 @@
1
+ # HtmlLogParser class, a subclass of BasicParser.
2
+ # Used for parse()ing HTML logs.
3
+
4
+ require 'balance_tags_c'
5
+
6
+ module Pidgin2Adium
7
+ class HtmlLogParser < BasicParser
8
+ def initialize(src_path, user_aliases)
9
+ super(src_path, user_aliases)
10
+ @timestamp_rx = '\(((?:\d{4}-\d{2}-\d{2} )?\d{1,2}:\d{1,2}:\d{1,2}(?: [AP]M)?)\)'
11
+
12
+ # @line_regex matches a line in an HTML log file other than the
13
+ # first. The timestamp matches on either "2008-11-17 14:12:21" or "14:12:21".
14
+ # @line_regex match obj:
15
+ # 0: timestamp, extended or not
16
+ # 1: screen name or alias, if alias set
17
+ # 2: "&lt;AUTO-REPLY&gt;" or nil
18
+ # 3: message body
19
+ # The ":" is optional to allow for strings like "(17:12:21) <b>***Gabe B-W</b> is confused<br/>"
20
+ @line_regex = /#{@timestamp_rx} ?<b>(.+?) ?(&lt;AUTO-REPLY&gt;)?:?<\/b> ?(.+)<br ?\/>/o
21
+ # @line_regex_status matches a status line
22
+ # @line_regex_status match obj:
23
+ # 0: timestamp
24
+ # 1: status message
25
+ @line_regex_status = /#{@timestamp_rx} ?<b> (.+)<\/b><br ?\/>/o
26
+ end
27
+
28
+ # Returns a cleaned string.
29
+ # Removes the following tags from _text_:
30
+ # * html
31
+ # * body
32
+ # * font
33
+ # * a with no innertext, e.g. <a href="blah"></a>
34
+ # And removes the following style declarations:
35
+ # * color: #000000 (just turns text black)
36
+ # * font-family
37
+ # * font-size
38
+ # * background
39
+ # * em (a tag rather than a style declaration; it's changed to <span style="font-style: italic;">)
40
+ # Spans left with no style declarations after this filtering are removed
41
+ # (but the text inside them is preserved); empty spans are dropped entirely.
42
+ def cleanup(text)
43
+ # Sometimes this is in there. I don't know why.
44
+ text.gsub!(%r{&lt;/FONT HSPACE='\d'>}, '')
45
+ # We can remove <font> safely since Pidgin and Adium both show bold
46
+ # using <span style="font-weight: bold;"> except Pidgin uses single
47
+ # quotes while Adium uses double quotes.
48
+ text.gsub!(/<\/?(?:html|body|font)(?: .+?)?>/, '') # very important!
49
+
50
+ text.tr!("\r", '')
51
+ # Remove empty lines
52
+ text.gsub!("\n\n", "\n")
53
+
54
+ # Remove newlines that end the file, since they screw up the
55
+ # newline -> <br/> conversion
56
+ text.gsub!(/\n\Z/, '')
57
+
58
+ # Replace newlines with "<br/>" unless they end a chat line.
59
+ # This must go after we remove <font> tags.
60
+ text.gsub!(/\n(?!#{@timestamp_rx})/, '<br/>')
61
+
62
+ # These empty links are sometimes appended to every line in a chat,
63
+ # for some weird reason. Remove them.
64
+ text.gsub!(%r{<a href=['"].+?['"]>\s*?</a>}, '')
65
+
66
+ # Replace single quotes inside tags with double quotes so we can
67
+ # easily change single quotes to entities.
68
+ # For spans, removes a space after the final declaration if it exists.
69
+ text.gsub!(/<span style='([^']+?;) ?'>/, '<span style="\1">')
70
+ text.gsub!(/([a-z]+=)'(.+?)'/, '\1"\2"')
71
+ =begin
72
+ text.gsub!(/<a href='(.+?)'>/, '<a href="\1">')
73
+ text.gsub!(/<img src='([^']+?)'/, '<img src="\1"')
74
+ text.gsub!(/ alt='([^']+?)'/, ' alt="\1"')
75
+ =end
76
+ text.gsub!("'", '&apos;')
77
+
78
+ # This actually does match stuff, but doesn't group it correctly. :(
79
+ # text.gsub!(%r{<span style="((?:.+?;)+)">(.*?)</span>}) do |s|
80
+ text.gsub!(%r{<span style="(.+?)">(.*?)</span>}) do |s|
81
+ # Remove empty spans.
82
+ next if $2 == ''
83
+
84
+ # style = style declaration
85
+ # innertext = text inside <span>
86
+ style, innertext = $1, $2
87
+ # TODO: replace double quotes with "&quot;", but only outside tags; may still be tags inside spans
88
+ # innertext.gsub!("")
89
+
90
+ styleparts = style.split(/; ?/)
91
+ styleparts.map! do |p|
92
+ if p[0,5] == 'color'
93
+ if p.include?('color: #000000')
94
+ next
95
+ elsif p =~ /(color: #[0-9a-fA-F]{6})(>.*)?/
96
+ # Regarding the bit with the ">", sometimes this happens:
97
+ # <span style="color: #ff0000>today;">today was busy</span>
98
+ # Then p = "color: #ff0000>today" (black, #000000, is dropped by the branch above)
99
+ # Or it can end in ">;", with no text before the semicolon.
100
+ # So keep the color but remove the ">" and anything following it.
101
+ next($1)
102
+ end
103
+ else
104
+ # don't remove font-weight
105
+ case p
106
+ when /^font-family/ then next
107
+ when /^font-size/ then next
108
+ when /^background/ then next
109
+ end
110
+ end
111
+ end.compact!
112
+ unless styleparts.empty?
113
+ style = styleparts.join('; ')
114
+ innertext = "<span style=\"#{style};\">#{innertext}</span>"
115
+ end
116
+ innertext
117
+ end
118
+ # Pidgin uses <em>, Adium uses <span>
119
+ if text.gsub!('<em>', '<span style="font-style: italic;">')
120
+ text.gsub!('</em>', '</span>')
121
+ end
122
+ return text
123
+ end
124
+ end # END HtmlLogParser class
125
+ end
@@ -20,8 +20,12 @@ module Pidgin2Adium
20
20
  @my_aliases = aliases
21
21
 
22
22
  unless File.directory?(@pidgin_log_dir)
23
- puts "Source directory #{@pidgin_log_dir} does not exist or is not a directory."
24
- raise Errno::ENOENT
23
+ msg = "Source directory #{@pidgin_log_dir} does not exist or is not a directory."
24
+ error(msg)
25
+
26
+ # ENOENT automatically prepends "No such file or directory - " to
27
+ # its initializer's arguments
28
+ raise Errno::ENOENT.new("source directory #{@pidgin_log_dir}")
25
29
  end
26
30
  end
27
31
 
@@ -31,11 +35,11 @@ module Pidgin2Adium
31
35
  def start
32
36
  log_msg "Begin converting."
33
37
  begin
34
- files_path = get_all_chat_files(@pidgin_log_dir)
38
+ files_path = get_all_chat_files()
35
39
  rescue Errno::EACCES => bang
36
40
  error("Sorry, permission denied for getting Pidgin chat files from #{@pidgin_log_dir}.")
37
41
  error("Details: #{bang.message}")
38
- raise Errno::EACCES
42
+ raise bang
39
43
  end
40
44
 
41
45
  total_files = files_path.size
@@ -52,21 +56,16 @@ module Pidgin2Adium
52
56
 
53
57
  delete_search_indexes()
54
58
 
55
- log_msg "Finished converting! Converted #{total_successes} files of #{total_files} total."
59
+ Pidgin2Adium.log_msg "Finished converting! Converted #{total_successes} files of #{total_files} total."
56
60
  puts "Minor error messages:"
57
61
  puts @@oops_messages.join("\n")
58
62
  puts "Major error messages:"
59
63
  puts @@error_messages.join("\n")
60
64
  end
61
65
 
62
- ###########
63
- private
64
- ###########
65
-
66
- def get_all_chat_files(dir)
67
- return [] if File.basename(dir) == ".system"
66
+ def get_all_chat_files
68
67
  # recurse into each subdir
69
- return (Dir.glob("#{@pidgin_log_dir}/**/*.{htm,html,txt}") - BAD_DIRS)
68
+ Dir.glob("#{@pidgin_log_dir}/**/*.{htm,html,txt}") - BAD_DIRS
70
69
  end
71
- end # END LogConverter class
70
+ end
72
71
  end
@@ -98,5 +98,5 @@ module Pidgin2Adium
98
98
 
99
99
  return output_path
100
100
  end
101
- end # END LogFile class
101
+ end
102
102
  end
@@ -1,618 +1,3 @@
1
- # Contains the class BasicParser and its subclasses, HtmlLogParser and
2
- # TextFileParser, which parse the file passed into it and return a LogFile
3
- # object.
4
- #
5
- # Please use Pidgin2Adium.parse or Pidgin2Adium.parse_and_generate instead of
6
- # using these classes directly.
7
- require 'parsedate'
8
- require 'time' # for Time.zone_offset
9
-
10
- require 'balance_tags_c'
11
- require 'pidgin2adium/log_file'
12
-
13
- module Pidgin2Adium
14
- # Empty class. Raise'd by LogParser if the first line of a log is not
15
- # parseable.
16
- class InvalidFirstLineError < StandardError; end
17
-
18
- # BasicParser is a base class. Its subclasses are TextLogParser and
19
- # HtmlLogParser.
20
- #
21
- # Please use Pidgin2Adium.parse or Pidgin2Adium.parse_and_generate instead of
22
- # using this class directly.
23
- class BasicParser
24
- include Pidgin2Adium
25
- def initialize(src_path, user_aliases)
26
- @src_path = src_path
27
- # Whitespace is removed for easy matching later on.
28
- @user_aliases = user_aliases.split(',').map!{|x| x.downcase.gsub(/\s+/,'') }.uniq
29
- # @user_alias is set each time get_sender_by_alias is called. It is a non-normalized
30
- # alias.
31
- # Set an initial value just in case the first message doesn't give
32
- # us an alias.
33
- @user_alias = user_aliases.split(',')[0]
34
-
35
- @tz_offset = get_time_zone_offset()
36
-
37
- file = File.new(@src_path, 'r')
38
- @first_line = file.readline
39
- @file_content = file.read
40
- file.close
41
-
42
- # Time regexes must be set before pre_parse().
43
- # "4/18/2007 11:02:00 AM" => %w{4, 18, 2007, 11, 02, 00, AM}
44
- # ONLY used (if at all) in first line of chat ("Conversation with...at...")
45
- @time_regex_first_line = %r{^(\d{1,2})/(\d{1,2})/(\d{4}) (\d{1,2}):(\d{2}):(\d{2}) ([AP]M)$}
46
- # "2007-04-17 12:33:13" => %w{2007, 04, 17, 12, 33, 13}
47
- @time_regex = /^(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})$/
48
- # sometimes a line in a chat doesn't have a full timestamp
49
- # "04:22:05 AM" => %w{04 22 05 AM}
50
- @minimal_time_regex = /^(\d{1,2}):(\d{2}):(\d{2})( [AP]M)?$/
51
-
52
- # Whether or not the first line is parseable.
53
- @first_line_is_valid = true
54
- begin
55
- @service,
56
- @user_SN,
57
- @partner_SN,
58
- # @basic_time_info is for files that only have the full
59
- # timestamp at the top; we can use it to fill in the minimal
60
- # per-line timestamps. It has only 3 elements (year, month,
61
- # dayofmonth) because you should be able to fill everything
62
- # else in. If you can't, something's wrong.
63
- @basic_time_info,
64
- # When the chat started, in Adium's format
65
- @adium_chat_time_start = pre_parse()
66
- rescue InvalidFirstLineError
67
- @first_line_is_valid = false
68
- error("Failed to parse, invalid first line: #{@src_path}")
69
- return # stop processing
70
- end
71
-
72
- # @status_map, @lib_purple_events, and @events are used in
73
- # create_status_or_event_msg
74
- @status_map = {
75
- /(.+) logged in\.$/ => 'online',
76
- /(.+) logged out\.$/ => 'offline',
77
- /(.+) has signed on\.$/ => 'online',
78
- /(.+) has signed off\.$/ => 'offline',
79
- /(.+) has gone away\.$/ => 'away',
80
- /(.+) is no longer away\.$/ => 'available',
81
- /(.+) has become idle\.$/ => 'idle',
82
- /(.+) is no longer idle\.$/ => 'available'
83
- }
84
-
85
- # lib_purple_events are all of event_type libPurple
86
- @lib_purple_events = [
87
- # file transfer
88
- /Starting transfer of .+ from (.+)/,
89
- /^Offering to send .+ to (.+)$/,
90
- /(.+) is offering to send file/,
91
- /^Transfer of file .+ complete$/,
92
- /Error reading|writing|accessing .+: .+/,
93
- /You cancell?ed the transfer of/,
94
- /File transfer cancelled/,
95
- /(.+?) cancell?ed the transfer of/,
96
- /(.+?) cancelled the file transfer/,
97
- # Direct IM - actual (dis)connect events are their own types
98
- /^Attempting to connect to (.+) at .+ for Direct IM\./,
99
- /^Asking (.+) to connect to us at .+ for Direct IM\./,
100
- /^Attempting to connect via proxy server\.$/,
101
- /^Direct IM with (.+) failed/,
102
- # encryption
103
- /Received message encrypted with wrong key/,
104
- /^Requesting key\.\.\.$/,
105
- /^Outgoing message lost\.$/,
106
- /^Conflicting Key Received!$/,
107
- /^Error in decryption- asking for resend\.\.\.$/,
108
- /^Making new key pair\.\.\.$/,
109
- # sending errors
110
- /^Last outgoing message not received properly- resetting$/,
111
- /Resending\.\.\./,
112
- # connection errors
113
- /Lost connection with the remote user:.+/,
114
- # chats
115
- /^.+ entered the room\.$/,
116
- /^.+ left the room\.$/
117
- ]
118
-
119
- # non-libpurple events
120
- # Each key maps to an event_type string. The keys will be matched against a line of chat
121
- # and the partner's alias will be in regex group 1, IF the alias is matched.
122
- @event_map = {
123
- # .+ is not an alias, it's a proxy server so no grouping
124
- /^Attempting to connect to .+\.$/ => 'direct-im-connect',
125
- # NB: pidgin doesn't track when Direct IM is disconnected, AFAIK
126
- /^Direct IM established$/ => 'directIMConnected',
127
- /Unable to send message/ => 'chat-error',
128
- /You missed .+ messages from (.+) because they were too large/ => 'chat-error',
129
- /User information not available/ => 'chat-error'
130
- }
131
-
132
- @ignore_events = [
133
- # Adium ignores SN/alias changes.
134
- /^.+? is now known as .+?\.<br\/?>$/
135
- ]
136
- end
137
-
138
- # This method returns a LogFile instance, or false if an error occurred.
139
- def parse
140
- return false unless @first_line_is_valid
141
- @file_content = cleanup(@file_content).split("\n")
142
-
143
- @file_content.map! do |line|
144
- # "next" returns nil which is removed by compact
145
- next if line =~ /^\s+$/
146
- if line =~ @line_regex
147
- create_msg($~.captures)
148
- elsif line =~ @line_regex_status
149
- msg = create_status_or_event_msg($~.captures)
150
- # Error occurred while parsing
151
- return false if msg == false
152
- else
153
- error "Could not parse line:"
154
- p line
155
- return false
156
- end
157
- end
158
- @file_content.compact!
159
- return LogFile.new(@file_content, @service, @user_SN, @partner_SN, @adium_chat_time_start)
160
- end
161
- # Prevent parse from being called directly from BasicParser, since
162
- # it uses subclassing magic.
163
- protected :parse
164
-
165
- #################
166
- private
167
- #################
168
-
169
- def get_time_zone_offset()
170
- # We must have a tz_offset or else the Adium Chat Log viewer
171
- # doesn't read the date correctly and then:
172
- # 1) the log has an empty start date column in the viewer
173
- # 2) The timestamps are all the same for the whole log
174
- tz_match = /([-\+]\d+)[A-Z]{3}\.(?:txt|htm|html)/.match(@src_path)
175
- if tz_match and tz_match[1]
176
- tz_offset = tz_match[1]
177
- else
178
- # "-0500" (3d rather than 2d to allow for "+")
179
- tz_offset = sprintf('%+03d00', Time.zone_offset(Time.now.zone) / 3600)
180
- end
181
- return tz_offset
182
- end
183
-
184
- #--
185
- # Adium time format: YYYY-MM-DD\THH:MM:SS[+-]TZ_HRS like:
186
- # 2008-10-05T22:26:20-0800
187
- # HOWEVER:
188
- # If it's the first line, then return it like this (note periods):
189
- # 2008-10-05T22.26.20-0800
190
- # because it will be used in the filename.
191
- #++
192
- # Converts a pidgin datestamp to an Adium one.
193
- def create_adium_time(time, is_first_line = false)
194
- # parsed_date = [year, month, day, hour, min, sec]
195
- if time =~ @time_regex
196
- year, month, day, hour, min, sec = $1.to_i,
197
- $2.to_i,
198
- $3.to_i,
199
- $4.to_i,
200
- $5.to_i,
201
- $6.to_i
202
- elsif is_first_line and time =~ @time_regex_first_line
203
- hour = $4.to_i
204
- if $7 == 'PM' and hour != 12
205
- hour += 12
206
- end
207
- year, month, day, min, sec = $3.to_i, # year
208
- $1.to_i, # month
209
- $2.to_i, # day
210
- # already did hour
211
- $5.to_i, # minutes
212
- $6.to_i # seconds
213
- elsif time =~ @minimal_time_regex
214
- # "04:22:05" => %w{04 22 05}
215
- hour = $1.to_i
216
- if $4 == 'PM' and hour != 12
217
- hour += 12
218
- end
219
- year, month, day = @basic_time_info
220
- min = $2.to_i
221
- sec = $3.to_i
222
- else
223
- error("You have found an odd timestamp. Please report it to the developer.")
224
- log_msg("The timestamp: #{time}")
225
- log_msg("Continuing...")
226
- year,month,day,hour,min,sec = ParseDate.parsedate(time)
227
- end
228
- if is_first_line
229
- adium_time = Time.local(year,month,day,hour,min,sec).strftime("%Y-%m-%dT%H.%M.%S#{@tz_offset}")
230
- else
231
- adium_time = Time.local(year,month,day,hour,min,sec).strftime("%Y-%m-%dT%H:%M:%S#{@tz_offset}")
232
- end
233
- return adium_time
234
- end
235
-
236
- # Extract required data from the file. Run by parse.
237
- def pre_parse
238
- # Deal with first line.
239
-
240
- # the first line is special. It tells us (in order of regex groups):
241
- # 1) who we're talking to
242
- # 2) what time/date
243
- # 3) what SN we used
244
- # 4) what protocol (AIM, icq, jabber...)
245
- first_line_match = /Conversation with (.+?) at (.+?) on (.+?) \((.+?)\)/.match(@first_line)
246
- if first_line_match.nil?
247
- raise InvalidFirstLineError
248
- else
249
- service = first_line_match[4]
250
- # @user_SN is normalized to avoid "AIM.name" and "AIM.na me" folders
251
- user_SN = first_line_match[3].downcase.tr(' ', '')
252
- partner_SN = first_line_match[1]
253
- pidgin_chat_time_start = first_line_match[2]
254
- basic_time_info = case pidgin_chat_time_start
255
- when @time_regex then [$1.to_i, $2.to_i, $3.to_i]
256
- when @time_regex_first_line then [$3.to_i, $1.to_i, $2.to_i]
257
- end
258
- adium_chat_time_start = create_adium_time(pidgin_chat_time_start, true)
259
- return [service,
260
- user_SN,
261
- partner_SN,
262
- basic_time_info,
263
- adium_chat_time_start]
264
- end
265
- end
266
-
267
- def get_sender_by_alias(alias_name)
268
- no_action = alias_name.sub(/^\*{3}/, '')
269
- if @user_aliases.include? no_action.downcase.gsub(/\s+/, '')
270
- # Set the current alias being used of the ones in @user_aliases
271
- @user_alias = no_action
272
- return @user_SN
273
- else
274
- return @partner_SN
275
- end
276
- end
277
-
278
- #--
279
- # create_msg takes an array of captures from matching against
280
- # @line_regex and returns a Message object or one of its subclasses.
281
- # It can be used for TextLogParser and HtmlLogParser because both of
282
- # them return data in the same indexes in the matches array.
283
- #++
284
- def create_msg(matches)
285
- msg = nil
286
- # Either a regular message line or an auto-reply/away message.
287
- time = create_adium_time(matches[0])
288
- buddy_alias = matches[1]
289
- sender = get_sender_by_alias(buddy_alias)
290
- body = matches[3]
291
- if matches[2] # auto-reply
292
- msg = AutoReplyMessage.new(sender, time, buddy_alias, body)
293
- else
294
- # normal message
295
- msg = XMLMessage.new(sender, time, buddy_alias, body)
296
- end
297
- return msg
298
- end
299
-
300
- #--
301
- # create_status_or_event_msg takes an array of +MatchData+ captures from
302
- # matching against @line_regex_status and returns an Event or Status.
303
- # Returns nil if it's a message that should be ignored, or false if an
304
- # error occurred.
305
- #++
306
- def create_status_or_event_msg(matches)
307
- # ["22:58:00", "BuddyName logged in."]
308
- # 0: time
309
- # 1: status message or event
310
- msg = nil
311
- time = create_adium_time(matches[0])
312
- str = matches[1]
313
- # Return nil, which will get compact'ed out
314
- return nil if @ignore_events.detect{|regex| str =~ regex }
315
-
316
- regex, status = @status_map.detect{|regex, status| str =~ regex}
317
- if regex and status
318
- # Status message
319
- buddy_alias = regex.match(str)[1]
320
- sender = get_sender_by_alias(buddy_alias)
321
- msg = StatusMessage.new(sender, time, buddy_alias, status)
322
- else
323
- # Test for event
324
- regex = @lib_purple_events.detect{|regex| str =~ regex }
325
- event_type = 'libpurpleEvent' if regex
326
- unless regex and event_type
327
- # not a libpurple event, try others
328
- if @event_map.detect{|regex,event_type| str =~ regex}
329
- regex, event_type = $1, $2
330
- else
331
- error(sprintf("Error parsing status or event message, no status or event found: %p", str))
332
- return false
333
- end
334
- end
335
- if regex and event_type
336
- regex_matches = regex.match(str)
337
- # Event message
338
- if regex_matches.size == 1
339
- # No alias - this means it's the user
340
- buddy_alias = @user_alias
341
- sender = @user_SN
342
- else
343
- buddy_alias = regex_matches[1]
344
- sender = get_sender_by_alias(buddy_alias)
345
- end
346
- msg = Event.new(sender, time, buddy_alias, str, event_type)
347
- end
348
- end
349
- return msg
350
- end
351
- end # END BasicParser class
352
-
353
- # Please use Pidgin2Adium.parse or Pidgin2Adium.parse_and_generate instead of
354
- # using this class directly.
355
- class TextLogParser < BasicParser
356
- def initialize(src_path, user_aliases)
357
- super(src_path, user_aliases)
358
- @timestamp_rx = '\((\d{1,2}:\d{1,2}:\d{1,2})\)'
359
-
360
- # @line_regex matches a line in a TXT log file other than the first
361
- # @line_regex matchdata:
362
- # 0: timestamp
363
- # 1: screen name or alias, if alias set
364
- # 2: "<AUTO-REPLY>" or nil
365
- # 3: message body
366
- @line_regex = /#{@timestamp_rx} (.*?) ?(<AUTO-REPLY>)?: (.*)/o
367
-
368
- # @line_regex_status matches a status line
369
- # @line_regex_status matchdata:
370
- # 0: timestamp
371
- # 1: status message
372
- @line_regex_status = /#{@timestamp_rx} ([^:]+)/o
373
- end
374
-
375
- public :parse
376
-
377
- #################
378
- private
379
- #################
380
-
381
- def cleanup(text)
382
- text.tr!("\r", '')
383
- # Replace newlines with "<br/>" unless they end a chat line.
384
- text.gsub!(/\n(?!#{@timestamp_rx}|\Z)/, '<br/>')
385
- # Escape entities since this will be in XML
386
- text.gsub!('&', '&amp;') # escape '&' first
387
- text.gsub!('<', '&lt;')
388
- text.gsub!('>', '&gt;')
389
- text.gsub!('"', '&quot;')
390
- text.gsub!("'", '&apos;')
391
- return text
392
- end
393
- end
394
-
395
- # Please use Pidgin2Adium.parse or Pidgin2Adium.parse_and_generate instead
396
- # of using this class directly.
397
- class HtmlLogParser < BasicParser
398
- def initialize(src_path, user_aliases)
399
- super(src_path, user_aliases)
400
- @timestamp_rx = '\(((?:\d{4}-\d{2}-\d{2} )?\d{1,2}:\d{1,2}:\d{1,2}(?: [AP]M)?)\)'
401
-
402
- # @line_regex matches a line in an HTML log file other than the
403
- # first time matches on either "2008-11-17 14:12" or "14:12"
404
- # @line_regex match obj:
405
- # 0: timestamp, extended or not
406
- # 1: screen name or alias, if alias set
407
- # 2: "&lt;AUTO-REPLY&gt;" or nil
408
- # 3: message body
409
- # The ":" is optional to allow for strings like "(17:12:21) <b>***Gabe B-W</b> is confused<br/>"
410
- @line_regex = /#{@timestamp_rx} ?<b>(.+?) ?(&lt;AUTO-REPLY&gt;)?:?<\/b> ?(.+)<br ?\/>/o
411
- # @line_regex_status matches a status line
412
- # @line_regex_status match obj:
413
- # 0: timestamp
414
- # 1: status message
415
- @line_regex_status = /#{@timestamp_rx} ?<b> (.+)<\/b><br ?\/>/o
416
- end
417
-
418
- public :parse
419
-
420
- #################
421
- private
422
- #################
423
-
424
- # Returns a cleaned string.
425
- # Removes the following tags from _text_:
426
- # * html
427
- # * body
428
- # * font
429
- # * a with no innertext, e.g. <a href="blah"></a>
430
- # And removes the following style declarations:
431
- # * color: #000000 (just turns text black)
432
- # * font-family
433
- # * font-size
434
- # * background
435
- # * em (really it's changed to <span style="font-style: italic;">)
436
- # Since each <span> has only one style declaration, spans with these
437
- # declarations are removed (but the text inside them is preserved).
438
- def cleanup(text)
439
- # Sometimes this is in there. I don't know why.
440
- text.gsub!(%r{&lt;/FONT HSPACE='\d'>}, '')
441
- # We can remove <font> safely since Pidgin and Adium both show bold
442
- # using <span style="font-weight: bold;"> except Pidgin uses single
443
- # quotes while Adium uses double quotes.
444
- text.gsub!(/<\/?(?:html|body|font)(?: .+?)?>/, '') # very important!
445
-
446
- text.tr!("\r", '')
447
- # Remove empty lines
448
- text.gsub!("\n\n", "\n")
449
-
450
- # Remove newlines that end the file, since they screw up the
451
- # newline -> <br/> conversion
452
- text.gsub!(/\n\Z/, '')
453
-
454
- # Replace newlines with "<br/>" unless they end a chat line.
455
- # This must go after we remove <font> tags.
456
- text.gsub!(/\n(?!#{@timestamp_rx})/, '<br/>')
457
-
458
- # These empty links are sometimes appended to every line in a chat,
459
- # for some weird reason. Remove them.
460
- text.gsub!(%r{<a href=('").+?\1>\s*?</a>}, '')
461
-
462
- # Replace single quotes inside tags with double quotes so we can
463
- # easily change single quotes to entities.
464
- # For spans, removes a space after the final declaration if it exists.
465
- text.gsub!(/<span style='([^']+?;) ?'>/, '<span style="\1">')
466
- text.gsub!(/([a-z]+=)'(.+?)'/, '\1"\2"')
467
- =begin
468
- text.gsub!(/<a href='(.+?)'>/, '<a href="\1">')
469
- text.gsub!(/<img src='([^']+?)'/, '<img src="\1"')
470
- text.gsub!(/ alt='([^']+?)'/, ' alt="\1"')
471
- =end
472
- text.gsub!("'", '&apos;')
473
-
474
- # This actually does match stuff, but doesn't group it correctly. :(
475
- # text.gsub!(%r{<span style="((?:.+?;)+)">(.*?)</span>}) do |s|
476
- text.gsub!(%r{<span style="(.+?)">(.*?)</span>}) do |s|
477
- # Remove empty spans.
478
- next if $2 == ''
479
-
480
- # style = style declaration
481
- # innertext = text inside <span>
482
- style, innertext = $1, $2
483
- # TODO: replace double quotes with "&quot;", but only outside tags; may still be tags inside spans
484
- # innertext.gsub!("")
485
-
486
- styleparts = style.split(/; ?/)
487
- styleparts.map! do |p|
488
- if p[0,5] == 'color'
489
- if p.include?('color: #000000')
490
- next
491
- elsif p =~ /(color: #[0-9a-fA-F]{6})(>.*)?/
492
- # Regarding the bit with the ">", sometimes this happens:
493
- # <span style="color: #000000>today;">today was busy</span>
494
- # Then p = "color: #000000>today"
495
- # Or it can end in ">;", with no text before the semicolon.
496
- # So keep the color but remove the ">" and anything following it.
497
- next($1)
498
- end
499
- else
500
- # don't remove font-weight
501
- case p
502
- when /^font-family/ then next
503
- when /^font-size/ then next
504
- when /^background/ then next
505
- end
506
- end
507
- end.compact!
508
- unless styleparts.empty?
509
- style = styleparts.join('; ')
510
- innertext = "<span style=\"#{style};\">#{innertext}</span>"
511
- end
512
- innertext
513
- end
514
- # Pidgin uses <em>, Adium uses <span>
515
- if text.gsub!('<em>', '<span style="font-style: italic;">')
516
- text.gsub!('</em>', '</span>')
517
- end
518
- return text
519
- end
520
- end # END HtmlLogParser class
521
-
522
- # A holding object for each line of the chat. It is subclassed as
523
- # appropriate (eg AutoReplyMessage). Each subclass (but not Message
524
- # itself) has its own to_s which prints out its information in a format
525
- # appropriate for putting in an Adium log file.
526
- # Subclasses: XMLMessage, AutoReplyMessage, StatusMessage, Event.
527
- class Message
528
- def initialize(sender, time, buddy_alias)
529
- # The sender's screen name
530
- @sender = sender
531
- # The time the message was sent, in Adium format (e.g.
532
- # "2008-10-05T22:26:20-0800")
533
- @time = time
534
- # The receiver's alias (NOT screen name)
535
- @buddy_alias = buddy_alias
536
- end
537
- attr_accessor :sender, :time, :buddy_alias
538
- end
539
-
540
- # Basic message with body text (as opposed to pure status messages, which
541
- # have no body).
542
- class XMLMessage < Message
543
- def initialize(sender, time, buddy_alias, body)
544
- super(sender, time, buddy_alias)
545
- @body = body
546
- @styled_body = '<div><span style="font-family: Helvetica; font-size: 12pt;">%s</span></div>' % @body
547
- normalize_body!()
548
- end
549
- attr_accessor :body
550
-
551
- def to_s
552
- return sprintf('<message sender="%s" time="%s" alias="%s">%s</message>' << "\n",
553
- @sender, @time, @buddy_alias, @styled_body)
554
- end
555
-
556
- #################
557
- private
558
- #################
559
-
560
- # Balances mismatched tags, normalizes body style, and fixes actions
561
- # so they are in Adium style (Pidgin uses "***Buddy waves at you", Adium uses
562
- # "*Buddy waves at you*").
563
- def normalize_body!
564
- normalize_body_entities!()
565
- # Fix mismatched tags. Yes, it's faster to do it per-message
566
- # than all at once.
567
- @body = Pidgin2Adium.balance_tags_c(@body)
568
- if @buddy_alias[0,3] == '***'
569
- # "***<alias>" is what pidgin sets as the alias for a /me action
570
- @buddy_alias.slice!(0,3)
571
- @body = '*' << @body << '*'
572
- end
573
- end
574
-
575
- # Escapes entities.
576
- def normalize_body_entities!
577
- # Convert '&' to '&amp;' only if it's not followed by an entity.
578
- @body.gsub!(/&(?!lt|gt|amp|quot|apos)/, '&amp;')
579
- end
580
- end # END XMLMessage
581
-
582
- # An auto reply message.
583
- class AutoReplyMessage < XMLMessage
584
- def to_s
585
- return sprintf('<message sender="%s" time="%s" auto="true" alias="%s">%s</message>' << "\n",
586
- @sender, @time, @buddy_alias, @styled_body)
587
- end
588
- end
589
-
590
- # A message saying e.g. "Blahblah has gone away."
591
- class StatusMessage < Message
592
- def initialize(sender, time, buddy_alias, status)
593
- super(sender, time, buddy_alias)
594
- @status = status
595
- end
596
- attr_accessor :status
597
-
598
- def to_s
599
- return sprintf('<status type="%s" sender="%s" time="%s" alias="%s"/>' << "\n", @status, @sender, @time, @buddy_alias)
600
- end
601
- end
602
-
603
- # Pidgin does not have Events, but Adium does. Pidgin mostly uses system
604
- # messages to display what Adium calls events. These include sending a file,
605
- # starting a Direct IM connection, or an error in chat.
606
- class Event < XMLMessage
607
- def initialize(sender, time, buddy_alias, body, event_type)
608
- super(sender, time, buddy_alias, body)
609
- @event_type = event_type
610
- end
611
- attr_accessor :event_type
612
-
613
- def to_s
614
- return sprintf('<event type="%s" sender="%s" time="%s" alias="%s">%s</event>',
615
- @event_type, @sender, @time, @buddy_alias, @styled_body)
616
- end
617
- end
618
- end # end module
1
+ require 'pidgin2adium/basic_parser'
2
+ require 'pidgin2adium/text_log_parser'
3
+ require 'pidgin2adium/html_log_parser'