pipio 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +27 -0
  3. data/.rspec +2 -0
  4. data/.simplecov +5 -0
  5. data/.travis.yml +12 -0
  6. data/Gemfile +3 -0
  7. data/LICENSE +20 -0
  8. data/NEWS.md +10 -0
  9. data/README.md +88 -0
  10. data/Rakefile +13 -0
  11. data/lib/pipio.rb +34 -0
  12. data/lib/pipio/alias_registry.rb +26 -0
  13. data/lib/pipio/chat.rb +39 -0
  14. data/lib/pipio/cleaners/html_cleaner.rb +95 -0
  15. data/lib/pipio/cleaners/text_cleaner.rb +15 -0
  16. data/lib/pipio/file_reader.rb +29 -0
  17. data/lib/pipio/message_creators/auto_or_xml_message_creator.rb +25 -0
  18. data/lib/pipio/message_creators/event_message_creator.rb +47 -0
  19. data/lib/pipio/message_creators/status_message_creator.rb +19 -0
  20. data/lib/pipio/messages/auto_reply_message.rb +7 -0
  21. data/lib/pipio/messages/event.rb +67 -0
  22. data/lib/pipio/messages/message.rb +23 -0
  23. data/lib/pipio/messages/status_message.rb +26 -0
  24. data/lib/pipio/messages/xml_message.rb +43 -0
  25. data/lib/pipio/metadata.rb +34 -0
  26. data/lib/pipio/metadata_parser.rb +55 -0
  27. data/lib/pipio/parser_factory.rb +32 -0
  28. data/lib/pipio/parsers/basic_parser.rb +83 -0
  29. data/lib/pipio/parsers/html_log_parser.rb +22 -0
  30. data/lib/pipio/parsers/null_parser.rb +9 -0
  31. data/lib/pipio/parsers/text_log_parser.rb +21 -0
  32. data/lib/pipio/tag_balancer.rb +163 -0
  33. data/lib/pipio/time_parser.rb +36 -0
  34. data/lib/pipio/version.rb +3 -0
  35. data/pipio.gemspec +27 -0
  36. data/spec/pipio/alias_registry_spec.rb +37 -0
  37. data/spec/pipio/chat_spec.rb +66 -0
  38. data/spec/pipio/cleaners/html_cleaner_spec.rb +102 -0
  39. data/spec/pipio/cleaners/text_cleaner_spec.rb +29 -0
  40. data/spec/pipio/file_reader_spec.rb +130 -0
  41. data/spec/pipio/messages/auto_reply_message_spec.rb +40 -0
  42. data/spec/pipio/messages/event_spec.rb +41 -0
  43. data/spec/pipio/messages/status_message_spec.rb +43 -0
  44. data/spec/pipio/messages/xml_message_spec.rb +55 -0
  45. data/spec/pipio/metadata_parser_spec.rb +81 -0
  46. data/spec/pipio/metadata_spec.rb +72 -0
  47. data/spec/pipio/parser_factory_spec.rb +31 -0
  48. data/spec/pipio/parsers/html_log_parser_spec.rb +160 -0
  49. data/spec/pipio/parsers/null_parser_spec.rb +13 -0
  50. data/spec/pipio/parsers/text_log_parser_spec.rb +37 -0
  51. data/spec/pipio/tag_balancer_spec.rb +16 -0
  52. data/spec/pipio/time_parser_spec.rb +66 -0
  53. data/spec/pipio_spec.rb +63 -0
  54. data/spec/spec_helper.rb +18 -0
  55. data/spec/support/chat_builder.rb +29 -0
  56. data/spec/support/chat_builder_helpers.rb +41 -0
  57. data/spec/support/file_builder.rb +22 -0
  58. data/spec/support/html_chat_builder.rb +67 -0
  59. data/spec/support/logfiles/2006-12-21.223606.txt +3 -0
  60. data/spec/support/logfiles/2008-01-15.071445-0500PST.htm +5 -0
  61. data/spec/support/logfiles/2008-01-15.071445-0500PST.html +5 -0
  62. data/spec/support/text_chat_builder.rb +21 -0
  63. data/spec/test-output/README.md +1 -0
  64. data/spec/test-output/html_log_output.xml +6 -0
  65. data/spec/test-output/text_log_output.xml +4 -0
  66. metadata +193 -0
@@ -0,0 +1,22 @@
1
module Pipio
  # Parser for HTML-formatted log files. It builds the two line-matching
  # regexes for the HTML format and hands them, together with the HTML
  # cleaner, to a BasicParser that does the actual work.
  class HtmlLogParser
    # Matches a parenthesized timestamp, with an optional leading date and an
    # optional AM/PM suffix; the text inside the parentheses is captured as
    # "timestamp".
    TIMESTAMP_REGEX = /\((?<timestamp>(?:\d{4}-\d{2}-\d{2} )?\d{1,2}:\d{1,2}:\d{1,2}(?: [AP]M)?)\)/

    # source_file_path and user_aliases are passed through unchanged to
    # BasicParser.
    def initialize(source_file_path, user_aliases)
      message_regex = message_line_regex
      status_regex = status_line_regex
      html_cleaner = Cleaners::HtmlCleaner

      @parser = BasicParser.new(
        source_file_path,
        user_aliases,
        message_regex,
        status_regex,
        html_cleaner
      )
    end

    # Delegates to the wrapped BasicParser and returns whatever it returns.
    def parse
      @parser.parse
    end

    private

    # Regex for a line in an HTML log file other than the first. Captures
    # timestamp, sn_or_alias, auto_reply (optional) and body.
    def message_line_regex
      /#{TIMESTAMP_REGEX} ?<b>(?<sn_or_alias>.+?) ?(?<auto_reply>&lt;AUTO-REPLY&gt;)?:?<\/b> ?(?<body>.+)<br ?\/>/o
    end

    # Regex for a status or event line. Captures timestamp and body.
    def status_line_regex
      /#{TIMESTAMP_REGEX} ?<b> (?<body>.+)<\/b><br ?\/>/o
    end
  end
end
@@ -0,0 +1,9 @@
1
module Pipio
  # A parser that does nothing: it accepts two constructor arguments, ignores
  # both, and its #parse returns nil.
  class NullParser
    # Both arguments are accepted and discarded.
    def initialize(logfile_path, aliases); end

    # Always returns nil.
    def parse
      nil
    end
  end
end
@@ -0,0 +1,21 @@
1
module Pipio
  # Parser for plain-text log files. It builds the two line-matching regexes
  # for the text format and hands them, together with the text cleaner, to a
  # BasicParser that does the actual work.
  class TextLogParser
    # Regex source (a String, interpolated below) matching a parenthesized
    # h:m:s timestamp; the text inside the parentheses is captured as
    # "timestamp".
    TIMESTAMP_REGEX = '\((?<timestamp>\d{1,2}:\d{1,2}:\d{1,2})\)'

    # source_file_path and user_aliases are passed through unchanged to
    # BasicParser.
    def initialize(source_file_path, user_aliases)
      message_regex = message_line_regex
      status_regex = status_line_regex
      text_cleaner = Cleaners::TextCleaner

      @parser = BasicParser.new(
        source_file_path,
        user_aliases,
        message_regex,
        status_regex,
        text_cleaner
      )
    end

    # Delegates to the wrapped BasicParser and returns whatever it returns.
    def parse
      @parser.parse
    end

    private

    # Regex for a line in a text log file other than the first. Captures
    # timestamp, sn_or_alias, auto_reply (optional) and body.
    def message_line_regex
      /#{TIMESTAMP_REGEX} (?<sn_or_alias>.*?) ?(?<auto_reply><AUTO-REPLY>)?: (?<body>.*)/o
    end

    # Regex for a status or event line. Captures timestamp and a body that
    # contains no colon.
    def status_line_regex
      /#{TIMESTAMP_REGEX} (?<body>[^:]+)/o
    end
  end
end
@@ -0,0 +1,163 @@
1
+ module Pipio
2
+ # Balances tags of string using a modified stack. Returns a balanced
3
+ # string, but also affects the text passed into it!
4
+ # Use text = balance_tags(text).
5
+
6
+ # From Wordpress's formatting.php; rewritten in Ruby by Gabe
7
+ # Berke-Williams, 2009.
8
+ # Author:: Leonard Lin <leonard@acm.org>
9
+ # License:: GPL v2.0
10
+ # Copyright:: November 4, 2001
11
+ class TagBalancer
12
+ def initialize(text)
13
+ @text = text
14
+
15
+ @tagstack = []
16
+ @stacksize = 0
17
+ @tagqueue = ''
18
+
19
+ # Known single-entity/self-closing tags
20
+ @self_closing_tags = %w(br hr img input meta)
21
+
22
+ # Tags that can be immediately nested within themselves
23
+ @nestable_tags = %w(blockquote div span font)
24
+
25
+ # 1: tagname, with possible leading "/"
26
+ # 2: attributes
27
+ @tag_regex = /<(\/?\w*)\s*([^>]*)>/
28
+ end
29
+
30
+ def balance
31
+ text = @text.dup
32
+ newtext = ''
33
+
34
+ @tagstack = []
35
+ @stacksize = 0
36
+ @tagqueue = ''
37
+
38
+ # WP bug fix for comments - in case you REALLY meant to type '< !--'
39
+ text.gsub!('< !--', '< !--')
40
+
41
+ # WP bug fix for LOVE <3 (and other situations with '<' before a number)
42
+ text.gsub!(/<([0-9]{1})/, '&lt;\1')
43
+
44
+ while ( pos = (text =~ @tag_regex) )
45
+ newtext << @tagqueue
46
+ tag = $1.downcase
47
+ attributes = $2
48
+ matchlen = $~[0].size
49
+
50
+ # clear the shifter
51
+ @tagqueue = ''
52
+ # Pop or Push
53
+ if end_tag?(tag)
54
+ tag.slice!(0,1)
55
+ if too_many_closing_tags?
56
+ tag = ''
57
+ #or close to be safe: tag = '/' << tag
58
+ elsif closing_tag?(tag)
59
+ # if stacktop value == tag close value then pop
60
+ tag = "</#{tag}>" # Close Tag
61
+ @tagstack.pop
62
+ @stacksize -= 1
63
+ else
64
+ # closing tag not at top, search for it
65
+ (@stacksize-1).downto(0) do |j|
66
+ if @tagstack[j] == tag
67
+ # add tag to tagqueue
68
+ ss = @stacksize - 1
69
+ ss.downto(j) do |k|
70
+ @tagqueue << "</#{@tagstack.pop}>"
71
+ @stacksize -= 1
72
+ end
73
+
74
+ break
75
+ end
76
+ end
77
+ tag = ''
78
+ end
79
+ else
80
+ # Begin Tag
81
+
82
+ # Tag Cleaning
83
+ if self_closing_attributes?(attributes) || empty_tag?(tag)
84
+ elsif self_closing_tag?(tag)
85
+ # ElseIf: it's a known single-entity tag but it doesn't close itself, do so
86
+ attributes << '/'
87
+ else
88
+ # Push the tag onto the stack
89
+ # If the top of the stack is the same as the tag we want to push, close previous tag
90
+ if (@stacksize > 0 &&
91
+ ! nestable?(tag) &&
92
+ @tagstack[@stacksize - 1] == tag)
93
+ @tagqueue = "</#{@tagstack.pop}>"
94
+ @stacksize -= 1
95
+ end
96
+ @tagstack.push(tag)
97
+ @stacksize += 1
98
+ end
99
+
100
+ # Attributes
101
+ if attributes != ''
102
+ attributes = ' ' + attributes
103
+ end
104
+ tag = "<#{tag}#{attributes}>"
105
+ #If already queuing a close tag, then put this tag on, too
106
+ if @tagqueue
107
+ @tagqueue << tag
108
+ tag = ''
109
+ end
110
+ end
111
+ newtext << text[0,pos] << tag
112
+ text = text[pos+matchlen, text.length - (pos+matchlen)]
113
+ end
114
+
115
+ # Clear Tag Queue
116
+ newtext << @tagqueue
117
+
118
+ # Add Remaining text
119
+ newtext << text
120
+
121
+ # Empty Stack
122
+ @tagstack.reverse_each do |t|
123
+ newtext << "</#{t}>" # Add remaining tags to close
124
+ end
125
+
126
+ # WP fix for the bug with HTML comments
127
+ newtext.gsub!("< !--", "<!--")
128
+ newtext.gsub!("< !--", "< !--")
129
+
130
+ newtext
131
+ end
132
+
133
+ private
134
+
135
+ def end_tag?(string)
136
+ string[0,1] == "/"
137
+ end
138
+
139
+ def closing_tag?(tag)
140
+ @tagstack[@stacksize - 1] == tag
141
+ end
142
+
143
+ def too_many_closing_tags?
144
+ @stacksize <= 0
145
+ end
146
+
147
+ def self_closing_attributes?(attributes)
148
+ attributes[-1,1] == '/'
149
+ end
150
+
151
+ def empty_tag?(tag)
152
+ tag == ''
153
+ end
154
+
155
+ def self_closing_tag?(tag)
156
+ @self_closing_tags.include?(tag)
157
+ end
158
+
159
+ def nestable?(tag)
160
+ @nestable_tags.include?(tag)
161
+ end
162
+ end
163
+ end
@@ -0,0 +1,36 @@
1
module Pipio
  # Turns timestamp strings into Time objects. Timestamps that carry only a
  # time of day are combined with the year/month/day given to the
  # constructor before parsing.
  class TimeParser
    # A bare time of day such as "3:01:45" or "03:01:45 PM".
    NO_DATE = /\A\d{1,2}:\d{1,2}:\d{1,2}(?: [AP]M)?\Z/

    # strptime format tried when Time.parse raises ArgumentError, e.g. for
    # 01/22/2008 03:01:45 PM
    UNPARSEABLE_BY_DATETIME_PARSE = '%m/%d/%Y %I:%M:%S %P'

    # year, month, day - interpolated into the "year-month-day" string that
    # is prepended to date-less timestamps.
    def initialize(year, month, day)
      @fallback_date_string = format('%s-%s-%s', year, month, day)
    end

    # Returns a Time for the given timestamp string, or nil when timestamp is
    # nil/false.
    def parse(timestamp)
      return unless timestamp

      datetime = timestamp
      datetime = "#{@fallback_date_string} #{timestamp}" if has_no_date?(timestamp)
      parse_with_date(datetime)
    end

    private

    # Tries Time.parse first and falls back to the explicit strptime format
    # when that raises ArgumentError. An ArgumentError from the fallback is
    # not rescued.
    def parse_with_date(timestamp)
      Time.parse(timestamp)
    rescue ArgumentError
      Time.strptime(timestamp, UNPARSEABLE_BY_DATETIME_PARSE)
    end

    # Truthy when the timestamp, ignoring surrounding whitespace, is just a
    # time of day.
    def has_no_date?(timestamp)
      NO_DATE =~ timestamp.strip
    end
  end
end
@@ -0,0 +1,3 @@
1
module Pipio
  # Version string of the gem; the gemspec reads it as the gem's version.
  VERSION = "0.0.1"
end
@@ -0,0 +1,27 @@
1
+ # coding: utf-8
2
# Make lib/ loadable so the version constant can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require "pipio/version"

Gem::Specification.new do |gem|
  # Identity
  gem.name        = "pipio"
  gem.version     = Pipio::VERSION
  gem.authors     = ["Gabe Berke-Williams"]
  gem.email       = "gabe@thoughtbot.com"
  gem.description = "A fast, easy way to parse Pidgin (gaim) logs"
  gem.summary     = gem.description
  gem.homepage    = "https://github.com/gabebw/pipio"
  gem.license     = "MIT"

  # Packaged files: everything git tracks, with executables and tests
  # picked out by path.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  gem.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  gem.require_paths = ["lib"]

  gem.required_ruby_version = Gem::Requirement.new(">= 1.9.2")

  # Development-only dependencies, declared in this order.
  [
    ["mocha"],
    ["rspec", "~> 3.0"],
    ["rake"],
    ["simplecov"]
  ].each do |name_and_requirements|
    gem.add_development_dependency(*name_and_requirements)
  end
end
@@ -0,0 +1,37 @@
1
describe Pipio::AliasRegistry do
  # Registry shared within an example; built with 'default' as its default
  # screen name. Examples that test the default build their own registry.
  let(:alias_registry) { described_class.new('default') }

  it 'keeps track of aliases' do
    alias_registry['My Cool Alias'] = 'screen_name88'

    expect(alias_registry['My Cool Alias']).to eq('screen_name88')
  end

  it 'finds aliases even when they are queried with an action' do
    alias_registry['My Cool Alias'] = 'screen_name88'

    # Lookup key carries a leading "***"
    expect(alias_registry['***My Cool Alias']).to eq('screen_name88')
  end

  it 'downcases screen names' do
    alias_registry['alias'] = 'UPCASE'

    expect(alias_registry['alias']).to eq('upcase')
  end

  it 'removes space from screen names' do
    alias_registry['alias'] = 'a space'

    expect(alias_registry['alias']).to eq('aspace')
  end

  it 'takes a default' do
    registry = described_class.new('default_name')

    expect(registry['alias']).to eq('default_name')
  end

  it 'normalizes the default' do
    registry = described_class.new('DEFAULT NAME')

    expect(registry['alias']).to eq('defaultname')
  end
end
@@ -0,0 +1,66 @@
1
describe Pipio::Chat do
  # Builds a Pipio::Metadata from a default set of values; any key passed in
  # overrides replaces the corresponding default.
  def metadata(overrides = {})
    defaults = {
      my_screen_name: 'me',
      their_screen_name: 'them',
      start_time: Time.now,
      service: 'aim'
    }

    Pipio::Metadata.new(defaults.merge(overrides))
  end

  describe '#to_s' do
    it 'converts all Messages to strings and joins them' do
      messages = [:a, 1, 3]

      expect(described_class.new(messages, metadata).to_s).to eq("a\n1\n3")
    end
  end

  describe '#messages' do
    it 'returns all messages' do
      messages = %w(a b c)

      expect(described_class.new(messages, metadata).messages).to eq %w(a b c)
    end
  end

  it 'is enumerable' do
    upcased = described_class.new(%w(a b c), metadata).map(&:upcase)

    expect(upcased).to eq(%w(A B C))
  end

  describe '#their_screen_name' do
    it 'is the screen name of the other person in the chat' do
      subject_chat = described_class.new([], metadata(their_screen_name: 'them'))

      expect(subject_chat.their_screen_name).to eq('them')
    end
  end

  describe '#my_screen_name' do
    it 'is my screen name' do
      subject_chat = described_class.new([], metadata(my_screen_name: 'me'))

      expect(subject_chat.my_screen_name).to eq('me')
    end
  end

  describe '#start_time_xmlschema' do
    it 'is the start time of the chat in xmlschema format' do
      started_at = Time.now
      subject_chat = described_class.new([], metadata(start_time: started_at))

      expect(subject_chat.start_time_xmlschema).to eq(started_at.xmlschema)
    end
  end

  describe '#service' do
    it 'is the chat service' do
      subject_chat = described_class.new([], metadata(service: 'icq'))

      expect(subject_chat.service).to eq('icq')
    end
  end
end
@@ -0,0 +1,102 @@
1
describe Pipio::Cleaners::HtmlCleaner, ".clean" do
  # Runs the line through the cleaner under test.
  def clean(line)
    Pipio::Cleaners::HtmlCleaner.clean(line)
  end

  it "removes html, body, and font tags" do
    clean_text = 'clean'
    dirty_text = %{<html><body><font color="red">#{clean_text}</font></body></html>}
    expect(clean(dirty_text)).to eq(clean_text)
  end

  it "removes those weird <FONT HSPACE> tags" do
    clean_text = 'clean'
    dirty_text = "&lt;/FONT HSPACE='2'>#{clean_text}"
    expect(clean(dirty_text)).to eq(clean_text)
  end

  it 'removes \r' do
    clean_text = 'clean'
    dirty_text = [clean_text, clean_text, clean_text].join("\r")
    expect(clean(dirty_text)).to eq(clean_text * 3)
  end

  it "removes empty lines" do
    clean_text = 'clean'
    dirty_text = "#{clean_text}\n\n"
    expect(clean(dirty_text)).to eq(clean_text)
  end

  it "replaces newlines with <br/>" do
    clean_text = "<br/>clean"
    dirty_text = "\nclean"
    expect(clean(dirty_text)).to eq(clean_text)
  end

  it "removes empty links" do
    clean_text = 'clean' * 2
    dirty_text = '<a href="awesomelink"> </a>clean' +
      "<a href='awesomelink'></a>clean"
    expect(clean(dirty_text)).to eq(clean_text)
  end

  describe "with <span>s" do
    it "removes font-family" do
      clean_text = 'clean'
      dirty_text = %Q{<span style='font-family: Helvetica;'>#{clean_text}</span>}
      expect(clean(dirty_text)).to eq(clean_text)
    end

    it "removes font-size" do
      clean_text = 'clean'
      dirty_text = %Q{<span style="font-size: 6;">#{clean_text}</span>}
      expect(clean(dirty_text)).to eq(clean_text)
    end

    it "removes background" do
      clean_text = 'clean'
      dirty_text = %Q{<span style="background: #00afaf;">#{clean_text}</span>}
      expect(clean(dirty_text)).to eq(clean_text)
    end

    # Description fixed to match the value actually exercised (six zeros).
    it "removes color=#000000" do
      clean_text = 'clean'
      dirty_text = %Q{<span style="color: #000000;">#{clean_text}</span>}
      expect(clean(dirty_text)).to eq(clean_text)
    end

    it "does not remove color that is not #000000" do
      dirty_text = %Q{<span style="color: #01ABcdef;">whatever</span>}
      expect(clean(dirty_text)).to eq(dirty_text)
    end

    it "removes improperly-formatted colors" do
      clean_text = 'clean'
      dirty_text = %Q{<span style="color: #0;">#{clean_text}</span>}
      expect(clean(dirty_text)).to eq(clean_text)
    end

    it "replaces <em> with italic font-style" do
      text = 'whatever'
      dirty_text = "<em>#{text}</em>"
      clean_text = %Q{<span style="font-style: italic;">#{text}</span>}
      expect(clean(dirty_text)).to eq(clean_text)
    end

    it "does not modify clean text" do
      expect(clean('clean')).to eq('clean')
    end

    # This implicitly tests a lot of other things, but they've been tested
    # before this too.
    it "removes a trailing space after style declaration and replaces single quotes with double quotes" do
      dirty_span_open = "<span style='color: #afaf00; font-size: 14pt; font-weight: bold; '>"
      # Single quotes replaced with double quotes, space before ">" removed;
      # only the color declaration is expected to remain.
      clean_span_open = '<span style="color: #afaf00;">'
      text = 'whatever'
      dirty_text = "#{dirty_span_open}#{text}</span>"
      clean_text = "#{clean_span_open}#{text}</span>"
      expect(clean(dirty_text)).to eq(clean_text)
    end
  end
end