chitchat 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/chitchat.rb ADDED
@@ -0,0 +1,3 @@
1
+ require_relative 'chitchat/data_handler'
2
+ require_relative 'chitchat/text_formatter'
3
+ require_relative 'chitchat/lexical_processor'
@@ -0,0 +1,49 @@
1
# Reads a semicolon-separated chat export and exposes the words of its
# messages, either as one flat list or grouped by sender name.
#
# The first row of the file is treated as a header row and skipped. Column
# positions are described by the *_COL constants below.
class DataHandler
  require 'csv'

  UNIXTIME_COL = 0
  TIMESTAMP_COL = 1
  SENDER_ID_COL = 2
  SENDER_NAME_COL = 3
  MESSAGE_TYPE_COL = 4
  MESSAGE_COL = 5
  MESSAGE_HTML_COL = 6
  # The two columns needed to answer "who said what".
  PERSON_SAYS = [SENDER_NAME_COL, MESSAGE_COL].freeze

  # @param text_file [String] path to the chat export to parse
  def initialize(text_file)
    @text_file = text_file
  end

  # Words of every message, grouped by sender name. Senders whose name
  # matches /unknown/ are left out.
  #
  # @return [Hash{String => Array<String>}] sender name => words, memoized
  def words_by_person
    @words_by_person ||= parse_words_by_person
  end

  # Words of every message in file order, regardless of sender.
  #
  # @return [Array<String>] memoized; empty when the file has no messages
  def all_words
    @all_words ||= parse_all_words
  end

  private

  # Parsed data rows (header dropped, blank lines removed). The file is read
  # once and the rows are reused by both public readers.
  #
  # Options are passed as keywords: handing CSV.read a positional options
  # hash raises ArgumentError on Ruby 3.
  def csv_as_array
    @csv_as_array ||= CSV.read(@text_file, col_sep: ';', quote_char: '"')
                         .drop(1)
                         .reject(&:empty?)
  end

  # Builds the sender => words hash, keeping senders in order of first
  # appearance. A row without a message contributes no words.
  def parse_words_by_person
    grouped = sender_message_pairs.each_with_object({}) do |(sender, message), acc|
      (acc[sender] ||= []).concat(message.to_s.split(' '))
    end
    grouped.delete_if { |person, _words| person =~ /unknown/ }
  end

  # Reduces every row to its [sender_name, message] pair. Rows that are too
  # short to have a message column yield a nil message. The rows themselves
  # are not modified.
  def sender_message_pairs
    csv_as_array.map { |row| row.values_at(*PERSON_SAYS) }
  end

  # flat_map always returns an Array (the previous map + flatten! returned
  # nil when there was nothing to flatten), and to_s tolerates rows that
  # have no message column.
  def parse_all_words
    csv_as_array.flat_map { |row| row[MESSAGE_COL].to_s.split }
  end
end
@@ -0,0 +1,34 @@
1
# Removes stop words from a list of words using one of two bundled stop
# word lists, plus an optional caller-supplied list.
#
# NOTE(review): Stopwords::Filter is referenced here but this class does not
# require the library that defines it; it has to be loaded before
# strip_stop_words! is called (TextFormatter requires 'stopwords').
# NOTE(review): the stop word paths are relative, so they are resolved
# against the process's current working directory.
class LexicalProcessor
  LEVEL_1_STOP_WORDS = 'dict/level_1_stop_words.txt'.freeze
  LEVEL_2_STOP_WORDS = 'dict/level_2_stop_words.txt'.freeze

  attr_reader :words

  # @param words [Array<String>] the words to process
  def initialize(words)
    @words = words
  end

  # Replaces the word list with a copy that has the stop words removed.
  #
  # @param level [Integer] which stop word list to use: 1 or 2
  # @param augment [Array<String>, String, nil] extra words to remove
  # @return [Array<String>] the remaining words
  # @raise [ArgumentError] if level is neither 1 nor 2
  def strip_stop_words!(level: 1, augment: [])
    @words = filter(level).filter(@words)
    # Array() lets callers pass nil or a single word as well as a list.
    @words -= Array(augment)
  end

  private

  # Builds the stop word filter for the requested level.
  def filter(level)
    file =
      case level
      when 1 then LEVEL_1_STOP_WORDS
      when 2 then LEVEL_2_STOP_WORDS
      else raise ArgumentError, 'Invalid stop word level', caller
      end
    Stopwords::Filter.new(stop_words(file))
  end

  # Reads one stop word per line. Surrounding whitespace and line endings
  # (including "\r\n") are stripped so every entry can actually match a
  # word; blank lines are ignored.
  def stop_words(file)
    File.foreach(file).map(&:strip).reject(&:empty?)
  end
end
@@ -0,0 +1,44 @@
1
# Normalizes a list of words: lower-casing and trimming on construction
# (unless disabled), and on request stripping surrounding punctuation,
# dropping numeric tokens and correcting spelling.
class TextFormatter
  require 'spellingbee'
  require 'stopwords'

  DICT = 'dict/dict.txt'.freeze

  # One or more punctuation characters at the very start or very end of a
  # word. \A and \z anchor to the whole string; ^ and $ would anchor to
  # individual lines.
  SURROUNDING_PUNCTUATION = /\A[[:punct:]]+|[[:punct:]]+\z/
  # Optionally signed integer or decimal, e.g. "6", "-3", ".5", "3.14".
  NUMERIC = /\A[+-]?(\d*\.)?\d+\z/

  attr_reader :words

  # @param words_array [Array<String>] words to format; the array is copied
  #   so the caller's array is never modified
  # @param clean [Boolean] when true, lower-case and trim every word
  def initialize(words_array, clean: true)
    @words = words_array.dup
    make_clean if clean
  end

  # Strips surrounding punctuation, removes numeric tokens and, when
  # options[:spelling_bee] is truthy, spell-corrects every word.
  #
  # Only the :spelling_bee key is read; :stopword_level and :custom_words
  # are accepted but currently have no effect.
  #
  # @param options [Hash] see above
  # @return [Array<String>] the scrubbed words
  def scrub!(options: { stopword_level: 2,
                        custom_words: [],
                        spelling_bee: false })
    strip_punctuation!
    remove_numeric!
    correct_spelling! if options[:spelling_bee]
    @words
  end

  private

  # Strips every leading and trailing punctuation character, so "!!oi!!"
  # becomes "oi" while "y!y" is unchanged. Words that consisted of nothing
  # but punctuation are removed rather than kept as empty strings.
  def strip_punctuation!
    @words.map! { |w| w.gsub(SURROUNDING_PUNCTUATION, '') }
    @words.reject!(&:empty?)
    @words
  end

  # Replaces each word with the spell checker's first suggestion, keeping
  # the original word when no suggestion is offered.
  def correct_spelling!
    @words.map! { |w| spell_checker.correct(w).first || w }
  end

  # Built on first use only, so formatters that never correct spelling do
  # not need the dictionary file.
  def spell_checker
    @s ||= SpellingBee.new source_text: DICT
  end

  def remove_numeric!
    @words.delete_if { |w| w =~ NUMERIC }
  end

  def make_clean
    @words.map! { |w| w.downcase.strip }
  end
end
@@ -0,0 +1,3 @@
1
+ unixtime;timestamp;sender_id;sender_name;message_type;message;message_html
2
+ 1403115881045472;"2014-06-18 14:24:41";105519660812379208737;"Ryan Collins";REGULAR_CHAT_MESSAGE;"A sample regular message"
3
+
@@ -0,0 +1,4 @@
1
+ unixtime;timestamp;sender_id;sender_name;message_type;message;message_html
2
+ 1403115881045472;"2014-06-18 14:24:41";105519660812379208737;"Ryan Collins";REGULAR_CHAT_MESSAGE;"A sample message from Ryan"
3
+ 1403115964483918;"2014-06-18 14:26:04";103491988956526132192;"Alex Doliner";REGULAR_CHAT_MESSAGE;"A sample message from Alex"
4
+
@@ -0,0 +1,14 @@
1
# Specs for DataHandler, run against the CSV fixtures in spec/fixtures.
describe 'data handler' do
  let(:single_sender) { DataHandler.new('spec/fixtures/g0.txt') }
  let(:two_senders) { DataHandler.new('spec/fixtures/g1.txt') }

  it 'returns a list of isolated words given a gchat csv file' do
    expect(single_sender.all_words).to match %w(A sample regular message)
  end

  it 'returns a list of words per person in gchat convo given a csv file' do
    ryan_words = two_senders.words_by_person['Ryan Collins']
    alex_words = two_senders.words_by_person['Alex Doliner']

    expect(ryan_words).to match %w(A sample message from Ryan)
    expect(alex_words).to match %w(A sample message from Alex)
  end
end
@@ -0,0 +1,14 @@
1
# Specs for LexicalProcessor#strip_stop_words! with the default stop word
# level, with and without caller-supplied extra words.
describe 'lexical processor' do
  let(:words) { %w[we've all been there another day dollar] }
  let(:processor) { LexicalProcessor.new(words) }

  it 'returns a list of words without stop words based on default list' do
    processor.strip_stop_words!
    expect(processor.words).to eq %w(another day dollar)
  end

  it 'excludes custom words' do
    processor.strip_stop_words!(augment: ['dollar'])
    expect(processor.words).to eq %w(another day)
  end
end
@@ -0,0 +1,22 @@
1
# Specs for TextFormatter: cleaning on construction, opting out of it, and
# scrub! with its default options.
describe 'text formatter' do
  let(:raw_words) { ['UnFormaTted', ' lisT ', ' of', ' WordS'] }

  it 'returns a list of cleaned words on init by default' do
    formatter = TextFormatter.new(raw_words)
    expect(formatter.words).to eq %w(unformatted list of words)
  end

  it 'returns a list of words on init if cleaning is overriden' do
    formatter = TextFormatter.new(raw_words, clean: false)
    expect(formatter.words).to eq raw_words
  end

  it 'returns a words without leading and trailing punctuation, no numerics' do
    formatter = TextFormatter.new(['a', 'message,', 'w!th', '6', 'words!', '!oi!'])
    formatter.scrub!
    expect(formatter.words).to eq ['a', 'message', 'w!th', 'words', 'oi']
  end
end
@@ -0,0 +1 @@
1
+ require 'chitchat'
metadata ADDED
@@ -0,0 +1,55 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: chitchat
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Ryan Collins
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-05-30 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Provides an API for working with conversation histories
14
+ email: ryancollins.biz@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - dict/dict.txt
20
+ - dict/level_1_stop_words.txt
21
+ - dict/level_2_stop_words.txt
22
+ - lib/chitchat.rb
23
+ - lib/chitchat/data_handler.rb
24
+ - lib/chitchat/lexical_processor.rb
25
+ - lib/chitchat/text_formatter.rb
26
+ - spec/fixtures/g0.txt
27
+ - spec/fixtures/g1.txt
28
+ - spec/lib/chitchat/data_handler_spec.rb
29
+ - spec/lib/chitchat/lexical_processor_spec.rb
30
+ - spec/lib/chitchat/text_formatter_spec.rb
31
+ - spec/spec_helper.rb
32
+ homepage:
33
+ licenses: []
34
+ metadata: {}
35
+ post_install_message:
36
+ rdoc_options: []
37
+ require_paths:
38
+ - lib
39
+ required_ruby_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ required_rubygems_version: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ requirements: []
50
+ rubyforge_project:
51
+ rubygems_version: 2.5.1
52
+ signing_key:
53
+ specification_version: 4
54
+ summary: NLP and parsing for popular instant messaging services
55
+ test_files: []