pipio 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (66) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +27 -0
  3. data/.rspec +2 -0
  4. data/.simplecov +5 -0
  5. data/.travis.yml +12 -0
  6. data/Gemfile +3 -0
  7. data/LICENSE +20 -0
  8. data/NEWS.md +10 -0
  9. data/README.md +88 -0
  10. data/Rakefile +13 -0
  11. data/lib/pipio.rb +34 -0
  12. data/lib/pipio/alias_registry.rb +26 -0
  13. data/lib/pipio/chat.rb +39 -0
  14. data/lib/pipio/cleaners/html_cleaner.rb +95 -0
  15. data/lib/pipio/cleaners/text_cleaner.rb +15 -0
  16. data/lib/pipio/file_reader.rb +29 -0
  17. data/lib/pipio/message_creators/auto_or_xml_message_creator.rb +25 -0
  18. data/lib/pipio/message_creators/event_message_creator.rb +47 -0
  19. data/lib/pipio/message_creators/status_message_creator.rb +19 -0
  20. data/lib/pipio/messages/auto_reply_message.rb +7 -0
  21. data/lib/pipio/messages/event.rb +67 -0
  22. data/lib/pipio/messages/message.rb +23 -0
  23. data/lib/pipio/messages/status_message.rb +26 -0
  24. data/lib/pipio/messages/xml_message.rb +43 -0
  25. data/lib/pipio/metadata.rb +34 -0
  26. data/lib/pipio/metadata_parser.rb +55 -0
  27. data/lib/pipio/parser_factory.rb +32 -0
  28. data/lib/pipio/parsers/basic_parser.rb +83 -0
  29. data/lib/pipio/parsers/html_log_parser.rb +22 -0
  30. data/lib/pipio/parsers/null_parser.rb +9 -0
  31. data/lib/pipio/parsers/text_log_parser.rb +21 -0
  32. data/lib/pipio/tag_balancer.rb +163 -0
  33. data/lib/pipio/time_parser.rb +36 -0
  34. data/lib/pipio/version.rb +3 -0
  35. data/pipio.gemspec +27 -0
  36. data/spec/pipio/alias_registry_spec.rb +37 -0
  37. data/spec/pipio/chat_spec.rb +66 -0
  38. data/spec/pipio/cleaners/html_cleaner_spec.rb +102 -0
  39. data/spec/pipio/cleaners/text_cleaner_spec.rb +29 -0
  40. data/spec/pipio/file_reader_spec.rb +130 -0
  41. data/spec/pipio/messages/auto_reply_message_spec.rb +40 -0
  42. data/spec/pipio/messages/event_spec.rb +41 -0
  43. data/spec/pipio/messages/status_message_spec.rb +43 -0
  44. data/spec/pipio/messages/xml_message_spec.rb +55 -0
  45. data/spec/pipio/metadata_parser_spec.rb +81 -0
  46. data/spec/pipio/metadata_spec.rb +72 -0
  47. data/spec/pipio/parser_factory_spec.rb +31 -0
  48. data/spec/pipio/parsers/html_log_parser_spec.rb +160 -0
  49. data/spec/pipio/parsers/null_parser_spec.rb +13 -0
  50. data/spec/pipio/parsers/text_log_parser_spec.rb +37 -0
  51. data/spec/pipio/tag_balancer_spec.rb +16 -0
  52. data/spec/pipio/time_parser_spec.rb +66 -0
  53. data/spec/pipio_spec.rb +63 -0
  54. data/spec/spec_helper.rb +18 -0
  55. data/spec/support/chat_builder.rb +29 -0
  56. data/spec/support/chat_builder_helpers.rb +41 -0
  57. data/spec/support/file_builder.rb +22 -0
  58. data/spec/support/html_chat_builder.rb +67 -0
  59. data/spec/support/logfiles/2006-12-21.223606.txt +3 -0
  60. data/spec/support/logfiles/2008-01-15.071445-0500PST.htm +5 -0
  61. data/spec/support/logfiles/2008-01-15.071445-0500PST.html +5 -0
  62. data/spec/support/text_chat_builder.rb +21 -0
  63. data/spec/test-output/README.md +1 -0
  64. data/spec/test-output/html_log_output.xml +6 -0
  65. data/spec/test-output/text_log_output.xml +4 -0
  66. metadata +193 -0
@@ -0,0 +1,22 @@
1
module Pipio
  # Parser for HTML-formatted logs. All of the work is delegated to a
  # BasicParser that is configured with the HTML-specific line patterns and
  # the HTML cleaner.
  class HtmlLogParser
    # A parenthesized time of day, optionally preceded by a "YYYY-MM-DD "
    # date and optionally followed by " AM"/" PM". Captured as "timestamp".
    TIMESTAMP_REGEX = /\((?<timestamp>(?:\d{4}-\d{2}-\d{2} )?\d{1,2}:\d{1,2}:\d{1,2}(?: [AP]M)?)\)/

    # source_file_path and user_aliases are passed through untouched to
    # BasicParser.
    def initialize(source_file_path, user_aliases)
      @parser = BasicParser.new(
        source_file_path,
        user_aliases,
        message_line_pattern,
        status_line_pattern,
        Cleaners::HtmlCleaner
      )
    end

    # Returns whatever the wrapped BasicParser returns from #parse.
    def parse
      @parser.parse
    end

    private

    # Matches a line in an HTML log file other than the first. Captures
    # "timestamp", "sn_or_alias", an optional "auto_reply" marker and "body".
    def message_line_pattern
      /#{TIMESTAMP_REGEX} ?<b>(?<sn_or_alias>.+?) ?(?<auto_reply>&lt;AUTO-REPLY&gt;)?:?<\/b> ?(?<body>.+)<br ?\/>/o
    end

    # Matches a status or event line. Captures "timestamp" and "body".
    def status_line_pattern
      /#{TIMESTAMP_REGEX} ?<b> (?<body>.+)<\/b><br ?\/>/o
    end
  end
end
@@ -0,0 +1,9 @@
1
module Pipio
  # A do-nothing parser: it accepts a log path and aliases, ignores both,
  # and its #parse always returns nil.
  class NullParser
    # Both arguments are accepted and discarded.
    def initialize(logfile_path, aliases); end

    # Always returns nil.
    def parse; end
  end
end
@@ -0,0 +1,21 @@
1
module Pipio
  # Parser for plain-text logs. All of the work is delegated to a
  # BasicParser that is configured with the text-specific line patterns and
  # the text cleaner.
  class TextLogParser
    # Regex source (kept as a String) for a parenthesized time of day,
    # captured as "timestamp".
    TIMESTAMP_REGEX = '\((?<timestamp>\d{1,2}:\d{1,2}:\d{1,2})\)'

    # source_file_path and user_aliases are passed through untouched to
    # BasicParser.
    def initialize(source_file_path, user_aliases)
      @parser = BasicParser.new(
        source_file_path,
        user_aliases,
        message_line_pattern,
        status_line_pattern,
        Cleaners::TextCleaner
      )
    end

    # Returns whatever the wrapped BasicParser returns from #parse.
    def parse
      @parser.parse
    end

    private

    # Matches a line in a text log file other than the first. Captures
    # "timestamp", "sn_or_alias", an optional "auto_reply" marker and "body".
    def message_line_pattern
      /#{TIMESTAMP_REGEX} (?<sn_or_alias>.*?) ?(?<auto_reply><AUTO-REPLY>)?: (?<body>.*)/o
    end

    # Matches a status or event line: a timestamp followed by text that
    # contains no colon. Captures "timestamp" and "body".
    def status_line_pattern
      /#{TIMESTAMP_REGEX} (?<body>[^:]+)/o
    end
  end
end
@@ -0,0 +1,163 @@
1
module Pipio
  # Balances the HTML tags of a string using a stack of the tags that are
  # currently open: tags left open are closed, closing tags with no matching
  # opener are dropped, and known single-entity tags (such as <br>) are made
  # self-closing. The string handed to the constructor is never modified;
  # #balance works on a copy and returns a new string.
  #
  #   balanced = Pipio::TagBalancer.new(text).balance

  # From Wordpress's formatting.php; rewritten in Ruby by Gabe
  # Berke-Williams, 2009.
  # Author:: Leonard Lin <leonard@acm.org>
  # License:: GPL v2.0
  # Copyright:: November 4, 2001
  class TagBalancer
    # Known single-entity/self-closing tags
    SELF_CLOSING_TAGS = %w(br hr img input meta).freeze

    # Tags that can be immediately nested within themselves
    NESTABLE_TAGS = %w(blockquote div span font).freeze

    # 1: tagname, with possible leading "/"
    # 2: attributes
    TAG_REGEX = /<(\/?\w*)\s*([^>]*)>/

    # A comment opener typed with a space after "<".
    SPACED_COMMENT_OPENER = '< !--'

    # Temporary stand-in for SPACED_COMMENT_OPENER while balancing. It has
    # to differ from the text it protects: real comments come out of the
    # tag loop looking like "< !--" and are collapsed back to "<!--", and
    # without a distinct stand-in a "< !--" from the input would be
    # collapsed too.
    SPACED_COMMENT_PLACEHOLDER = '<    !--'

    # text - the String whose tags should be balanced.
    def initialize(text)
      @text = text
      @tagstack = []
    end

    # Returns a new String with balanced tags. Tag names are downcased;
    # attributes are kept as written.
    def balance
      remaining = @text.dup
      balanced = ''.dup
      @tagstack = []

      # WP bug fix for comments - in case you REALLY meant to type '< !--'
      remaining.gsub!(SPACED_COMMENT_OPENER, SPACED_COMMENT_PLACEHOLDER)

      # WP bug fix for LOVE <3 (and other situations with '<' before a number)
      remaining.gsub!(/<([0-9]{1})/, '&lt;\1')

      while (match = TAG_REGEX.match(remaining))
        name = match[1].downcase
        markup = if name.start_with?('/')
                   close_tag(name[1..-1])
                 else
                   open_tag(name, match[2])
                 end

        # Text in front of the tag always precedes whatever the tag became.
        balanced << match.pre_match << markup
        remaining = match.post_match
      end

      # Whatever follows the last tag.
      balanced << remaining

      # Close every tag that is still open, innermost first.
      @tagstack.reverse_each { |open_name| balanced << "</#{open_name}>" }

      # WP fix for the bug with HTML comments: real comments went through the
      # loop as a nameless tag and now read "< !--"; collapse those first,
      # then restore the protected user-typed ones.
      balanced.gsub!(SPACED_COMMENT_OPENER, '<!--')
      balanced.gsub!(SPACED_COMMENT_PLACEHOLDER, SPACED_COMMENT_OPENER)

      balanced
    end

    private

    # Handles a closing tag called +name+ (leading "/" already removed) and
    # returns the markup to emit for it:
    # * nothing is open, or +name+ is not open anywhere: "" (tag dropped)
    # * +name+ is the innermost open tag: its closing tag
    # * +name+ is open further out: closing tags for every tag opened
    #   since, innermost first, ending with +name+ itself
    def close_tag(name)
      depth = @tagstack.rindex(name)
      return '' unless depth

      closers = ''.dup
      (@tagstack.size - depth).times { closers << "</#{@tagstack.pop}>" }
      closers
    end

    # Handles an opening tag and returns the markup to emit for it.
    def open_tag(name, attributes)
      implicit_closer = ''

      # Tags that already end in "/" and nameless matches (comments, etc.)
      # are emitted without touching the stack.
      unless attributes.end_with?('/') || name.empty?
        if SELF_CLOSING_TAGS.include?(name)
          # Known single-entity tag that doesn't close itself: do so.
          attributes += '/'
        else
          # Opening a non-nestable tag directly inside itself closes the
          # previous one first.
          if @tagstack.last == name && !NESTABLE_TAGS.include?(name)
            implicit_closer = "</#{@tagstack.pop}>"
          end
          @tagstack.push(name)
        end
      end

      attributes = " #{attributes}" unless attributes.empty?
      "#{implicit_closer}<#{name}#{attributes}>"
    end
  end
end
@@ -0,0 +1,36 @@
1
module Pipio
  # Turns timestamp strings into Time objects. Timestamps that carry only a
  # time of day are completed with the year, month and day given to the
  # constructor.
  class TimeParser
    # A bare time of day, e.g. "3:01:45" or "03:01:45 PM".
    NO_DATE = /\A\d{1,2}:\d{1,2}:\d{1,2}(?: [AP]M)?\Z/

    # strptime format tried when Time.parse rejects the timestamp,
    # e.g. 01/22/2008 03:01:45 PM
    UNPARSEABLE_BY_DATETIME_PARSE = '%m/%d/%Y %I:%M:%S %P'

    # year, month, day - the date used for timestamps that have none.
    def initialize(year, month, day)
      @fallback_date_string = [year, month, day].join('-')
    end

    # Returns the Time for +timestamp+, or nil when +timestamp+ is nil.
    def parse(timestamp)
      return unless timestamp

      dated = if date_missing?(timestamp)
                "#{@fallback_date_string} #{timestamp}"
              else
                timestamp
              end
      parse_with_date(dated)
    end

    private

    # Tries Time.parse first and falls back to the explicit strptime format
    # when that raises ArgumentError.
    def parse_with_date(timestamp)
      Time.parse(timestamp)
    rescue ArgumentError
      Time.strptime(timestamp, UNPARSEABLE_BY_DATETIME_PARSE)
    end

    # Truthy when the stripped timestamp is only a time of day.
    def date_missing?(timestamp)
      NO_DATE =~ timestamp.strip
    end
  end
end
@@ -0,0 +1,3 @@
1
module Pipio
  # Version string of the gem; read by the gemspec.
  VERSION = '0.0.1'
end
@@ -0,0 +1,27 @@
1
# coding: utf-8
# Put the gem's lib/ directory on the load path so the version file can be
# required before the gem is installed.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require "pipio/version"

Gem::Specification.new do |gem|
  # Identity
  gem.name        = "pipio"
  gem.version     = Pipio::VERSION
  gem.authors     = ["Gabe Berke-Williams"]
  gem.email       = "gabe@thoughtbot.com"
  gem.description = "A fast, easy way to parse Pidgin (gaim) logs"
  gem.summary     = gem.description
  gem.homepage    = "https://github.com/gabebw/pipio"
  gem.license     = "MIT"

  # Contents: everything tracked by git, with executables and tests picked
  # out by path.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  gem.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  gem.require_paths = ["lib"]

  gem.required_ruby_version = Gem::Requirement.new(">= 1.9.2")

  # Development-only dependencies
  gem.add_development_dependency("mocha")
  gem.add_development_dependency("rspec", "~> 3.0")
  gem.add_development_dependency("rake")
  gem.add_development_dependency("simplecov")
end
@@ -0,0 +1,37 @@
1
describe Pipio::AliasRegistry do
  # Registry shared by the examples that do not build their own; memoized
  # for the duration of one example.
  let(:alias_registry) { described_class.new('default') }

  it 'keeps track of aliases' do
    alias_registry['My Cool Alias'] = 'screen_name88'

    expect(alias_registry['My Cool Alias']).to eq('screen_name88')
  end

  it 'finds aliases even when they are queried with an action' do
    alias_registry['My Cool Alias'] = 'screen_name88'

    expect(alias_registry['***My Cool Alias']).to eq('screen_name88')
  end

  it 'downcases screen names' do
    alias_registry['alias'] = 'UPCASE'

    expect(alias_registry['alias']).to eq('upcase')
  end

  it 'removes space from screen names' do
    alias_registry['alias'] = 'a space'

    expect(alias_registry['alias']).to eq('aspace')
  end

  it 'takes a default' do
    registry = described_class.new('default_name')

    expect(registry['alias']).to eq('default_name')
  end

  it 'normalizes the default' do
    registry = described_class.new('DEFAULT NAME')

    expect(registry['alias']).to eq('defaultname')
  end
end
@@ -0,0 +1,66 @@
1
describe Pipio::Chat do
  # Builds a Pipio::Metadata from a complete set of default values, with
  # any of them replaced by +overrides+.
  def build_metadata(overrides = {})
    defaults = {
      my_screen_name: 'me',
      their_screen_name: 'them',
      start_time: Time.now,
      service: 'aim'
    }
    Pipio::Metadata.new(defaults.merge(overrides))
  end

  # Builds the chat under test from messages and metadata overrides.
  def build_chat(messages, overrides = {})
    described_class.new(messages, build_metadata(overrides))
  end

  describe '#to_s' do
    it 'converts all Messages to strings and joins them' do
      expect(build_chat([:a, 1, 3]).to_s).to eq("a\n1\n3")
    end
  end

  describe '#messages' do
    it 'returns all messages' do
      expect(build_chat(%w(a b c)).messages).to eq %w(a b c)
    end
  end

  it 'is enumerable' do
    expect(build_chat(%w(a b c)).map(&:upcase)).to eq(%w(A B C))
  end

  describe '#their_screen_name' do
    it 'is the screen name of the other person in the chat' do
      chat = build_chat([], their_screen_name: 'them')

      expect(chat.their_screen_name).to eq('them')
    end
  end

  describe '#my_screen_name' do
    it 'is my screen name' do
      chat = build_chat([], my_screen_name: 'me')

      expect(chat.my_screen_name).to eq('me')
    end
  end

  describe '#start_time_xmlschema' do
    it 'is the start time of the chat in xmlschema format' do
      started_at = Time.now
      chat = build_chat([], start_time: started_at)

      expect(chat.start_time_xmlschema).to eq(started_at.xmlschema)
    end
  end

  describe '#service' do
    it 'is the chat service' do
      chat = build_chat([], service: 'icq')

      expect(chat.service).to eq('icq')
    end
  end
end
@@ -0,0 +1,102 @@
1
describe Pipio::Cleaners::HtmlCleaner, ".clean" do
  # Runs the cleaner under test on one piece of log markup.
  def clean(line)
    described_class.clean(line)
  end

  # Wraps +text+ in a <span> carrying the given inline style, quoted with
  # double quotes.
  def styled_span(style, text)
    %Q{<span style="#{style}">#{text}</span>}
  end

  it "removes html, body, and font tags" do
    dirty = %{<html><body><font color="red">clean</font></body></html>}

    expect(clean(dirty)).to eq('clean')
  end

  it "removes those weird <FONT HSPACE> tags" do
    dirty = "&lt;/FONT HSPACE='2'>clean"

    expect(clean(dirty)).to eq('clean')
  end

  it 'removes \r' do
    dirty = %w(clean clean clean).join("\r")

    expect(clean(dirty)).to eq('cleancleanclean')
  end

  it "removes empty lines" do
    expect(clean("clean\n\n")).to eq('clean')
  end

  it "replaces newlines with <br/>" do
    expect(clean("\nclean")).to eq("<br/>clean")
  end

  it "removes empty links" do
    dirty = '<a href="awesomelink"> </a>clean' +
      "<a href='awesomelink'></a>clean"

    expect(clean(dirty)).to eq('cleanclean')
  end

  describe "with <span>s" do
    it "removes font-family" do
      dirty = %Q{<span style='font-family: Helvetica;'>clean</span>}

      expect(clean(dirty)).to eq('clean')
    end

    it "removes font-size" do
      expect(clean(styled_span('font-size: 6;', 'clean'))).to eq('clean')
    end

    it "removes background" do
      expect(clean(styled_span('background: #00afaf;', 'clean'))).to eq('clean')
    end

    it "removes color=#00000" do
      expect(clean(styled_span('color: #000000;', 'clean'))).to eq('clean')
    end

    it "does not remove color that is not #00000" do
      untouched = styled_span('color: #01ABcdef;', 'whatever')

      expect(clean(untouched)).to eq(untouched)
    end

    it "removes improperly-formatted colors" do
      expect(clean(styled_span('color: #0;', 'clean'))).to eq('clean')
    end

    it "replaces <em> with italic font-style" do
      expected = styled_span('font-style: italic;', 'whatever')

      expect(clean("<em>whatever</em>")).to eq(expected)
    end

    it "does not modify clean text" do
      expect(clean('clean')).to eq('clean')
    end

    # Covers several of the behaviours above at once: single quotes become
    # double quotes, the dropped declarations disappear, and the space
    # before ">" goes with them.
    it "removes a trailing space after style declaration and replaces double quotes" do
      dirty = "<span style='color: #afaf00; font-size: 14pt; font-weight: bold; '>whatever</span>"
      expected = '<span style="color: #afaf00;">whatever</span>'

      expect(clean(dirty)).to eq(expected)
    end
  end
end