chitchat 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/dict/dict.txt +235886 -0
- data/dict/level_1_stop_words.txt +173 -0
- data/dict/level_2_stop_words.txt +665 -0
- data/lib/chitchat.rb +3 -0
- data/lib/chitchat/data_handler.rb +49 -0
- data/lib/chitchat/lexical_processor.rb +34 -0
- data/lib/chitchat/text_formatter.rb +44 -0
- data/spec/fixtures/g0.txt +3 -0
- data/spec/fixtures/g1.txt +4 -0
- data/spec/lib/chitchat/data_handler_spec.rb +14 -0
- data/spec/lib/chitchat/lexical_processor_spec.rb +14 -0
- data/spec/lib/chitchat/text_formatter_spec.rb +22 -0
- data/spec/spec_helper.rb +1 -0
- metadata +55 -0
data/lib/chitchat.rb
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
require 'csv'

# Reads a semicolon-separated chat export and exposes its messages as lists
# of whitespace-separated words.
#
# The first row of the file is treated as a header and skipped; blank rows are
# ignored. Column positions are given by the *_COL constants.
class DataHandler
  UNIXTIME_COL = 0
  TIMESTAMP_COL = 1
  SENDER_ID_COL = 2
  SENDER_NAME_COL = 3
  MESSAGE_TYPE_COL = 4
  MESSAGE_COL = 5
  MESSAGE_HTML_COL = 6
  # Columns needed to answer "who said what": sender name, then message text.
  PERSON_SAYS = [SENDER_NAME_COL, MESSAGE_COL].freeze

  # @param text_file [String] path to the export file
  def initialize(text_file)
    @text_file = text_file
  end

  # Words grouped by sender name, in file order. Senders whose name contains
  # "unknown" are left out. The result is computed once and cached.
  #
  # @return [Hash{String => Array<String>}]
  def words_by_person
    @words_by_person ||= parse_words_by_person
  end

  # Every word of every message, in file order. Computed once and cached.
  #
  # @return [Array<String>] empty when the file holds no messages
  def all_words
    @all_words ||= parse_all_words
  end

  private

  # Data rows of the export (header dropped, blank rows removed).
  #
  # The options are passed as keywords: CSV.read no longer accepts a
  # positional options hash on Ruby 3. The rows are cached because nothing
  # below mutates them, so the file is read only once.
  def csv_as_array
    options = { col_sep: ';', quote_char: '"' }
    @csv_as_array ||= CSV.read(@text_file, **options).drop(1).reject(&:empty?)
  end

  def parse_words_by_person
    grouped = sender_and_message_pairs.group_by(&:first)
    grouped.each_with_object({}) do |(person, pairs), result|
      next if person =~ /unknown/

      # to_s: an empty CSV cell is parsed as nil and contributes no words.
      result[person] = pairs.flat_map { |_, message| message.to_s.split(' ') }
    end
  end

  # Projects every row down to [sender_name, message]. Rows are not filtered
  # here; rows with a missing message simply carry nil in second position.
  def sender_and_message_pairs
    csv_as_array.map { |row| row.values_at(*PERSON_SAYS) }
  end

  # flat_map (rather than map + flatten!) so an empty file yields [] and not
  # the nil that flatten! returns when it has nothing to flatten.
  def parse_all_words
    csv_as_array.flat_map { |row| row[MESSAGE_COL].to_s.split }
  end
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Removes stop words from a list of words using the bundled stop-word lists.
#
# NOTE(review): relies on Stopwords::Filter being loaded by the caller; this
# file does not require the stopwords gem itself.
class LexicalProcessor
  LEVEL_1_STOP_WORDS = 'dict/level_1_stop_words.txt'.freeze
  LEVEL_2_STOP_WORDS = 'dict/level_2_stop_words.txt'.freeze

  # Root that the relative stop-word paths are resolved against. Assumes this
  # file lives at lib/chitchat/ next to a top-level dict/ directory, as listed
  # in the gem metadata, so lookups do not depend on the working directory.
  DATA_ROOT = File.expand_path('../..', __dir__).freeze

  attr_reader :words

  # @param words [Array<String>] words to process; the array is not mutated
  def initialize(words)
    @words = words
  end

  # Replaces #words with a copy that has stop words removed.
  #
  # @param level [Integer] which bundled stop-word list to use (1 or 2)
  # @param augment [Array<String>] extra words to remove as well
  # @return [Array<String>] the remaining words
  # @raise [ArgumentError] if level is neither 1 nor 2
  def strip_stop_words!(level: 1, augment: [])
    @words = filter(level).filter(@words)
    @words -= augment
  end

  private

  # Builds the stop-word filter for the requested level.
  def filter(level)
    file = case level
           when 1 then LEVEL_1_STOP_WORDS
           when 2 then LEVEL_2_STOP_WORDS
           else raise ArgumentError, 'Invalid stop word level', caller
           end
    Stopwords::Filter.new(stop_words(file))
  end

  # Reads one stop word per line. Relative paths are resolved against
  # DATA_ROOT; absolute paths are used as given. chomp (rather than deleting
  # "\n") also strips the "\r" of CRLF-terminated files.
  def stop_words(file)
    File.foreach(File.expand_path(file, DATA_ROOT)).map(&:chomp)
  end
end
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
require 'spellingbee'
require 'stopwords'

# Normalises a list of words: lower-casing and trimming on construction, and
# punctuation / number removal (plus optional spelling correction) on #scrub!.
#
# The array passed to the constructor is modified in place by these steps.
class TextFormatter
  # NOTE(review): relative path handed to SpellingBee; presumably resolved
  # against the working directory -- confirm against the spellingbee gem.
  DICT = 'dict/dict.txt'.freeze

  # One or more punctuation characters at the very start or very end of a
  # word. \A and \z anchor to the whole string; ^ and $ would anchor per line.
  EDGE_PUNCTUATION = /\A[[:punct:]]+|[[:punct:]]+\z/
  # Optionally signed integer or decimal, e.g. "6", "-3", "+1.5", ".5".
  NUMERIC = /\A[+-]?(\d*\.)?\d+\z/

  attr_reader :words

  # @param words_array [Array<String>] words to format (modified in place)
  # @param clean [Boolean] when true, downcase and strip every word right away
  def initialize(words_array, clean: true)
    @s = SpellingBee.new source_text: DICT
    @words = words_array
    make_clean if clean
  end

  # Strips leading and trailing punctuation, drops words that end up empty,
  # drops purely numeric words and, when options[:spelling_bee] is truthy,
  # replaces every word by the spell checker's first suggestion.
  #
  # options[:stopword_level] and options[:custom_words] are accepted but not
  # used by this method.
  #
  # @param options [Hash] see the defaults in the signature
  # @return [Array<String>] the scrubbed words (same object as #words)
  def scrub!(options: { stopword_level: 2,
                        custom_words: [],
                        spelling_bee: false })
    strip_punctuation!
    remove_empty!
    remove_numeric!
    correct_spelling! if options[:spelling_bee]
    @words
  end

  private

  # Strips trailing and leading punctuation
  # aka !oi! and !!oi!! both become oi, but y!y is unchanged
  def strip_punctuation!
    @words.map! { |w| w.gsub(EDGE_PUNCTUATION, '') }
  end

  # A word made only of punctuation is an empty string after stripping.
  def remove_empty!
    @words.delete_if(&:empty?)
  end

  def correct_spelling!
    @words.map! { |w| @s.correct(w).first }
  end

  def remove_numeric!
    @words.delete_if { |w| w =~ NUMERIC }
  end

  def make_clean
    @words.map! { |w| w.downcase.strip }
  end
end
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
unixtime;timestamp;sender_id;sender_name;message_type;message;message_html
|
|
2
|
+
1403115881045472;"2014-06-18 14:24:41";105519660812379208737;"Ryan Collins";REGULAR_CHAT_MESSAGE;"A sample message from Ryan"
|
|
3
|
+
1403115964483918;"2014-06-18 14:26:04";103491988956526132192;"Alex Doliner";REGULAR_CHAT_MESSAGE;"A sample message from Alex"
|
|
4
|
+
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# Checks DataHandler's word extraction against the chat-export fixtures.
describe 'data handler' do
  it 'returns a list of isolated words given a gchat csv file' do
    words = DataHandler.new('spec/fixtures/g0.txt').all_words

    expect(words).to match %w[A sample regular message]
  end

  it 'returns a list of words per person in gchat convo given a csv file' do
    by_person = DataHandler.new('spec/fixtures/g1.txt').words_by_person

    expect(by_person['Ryan Collins']).to match %w[A sample message from Ryan]
    expect(by_person['Alex Doliner']).to match %w[A sample message from Alex]
  end
end
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# Checks stop-word removal with the default list and with extra custom words.
describe 'lexical processor' do
  let(:words) { ["we've", 'all', 'been', 'there', 'another', 'day', 'dollar'] }
  let(:processor) { LexicalProcessor.new(words) }

  it 'returns a list of words without stop words based on default list' do
    processor.strip_stop_words!

    expect(processor.words).to eq %w[another day dollar]
  end

  it 'excludes custom words' do
    processor.strip_stop_words!(augment: ['dollar'])

    expect(processor.words).to eq %w[another day]
  end
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Checks TextFormatter's cleaning on construction and its scrub! step.
describe 'text formatter' do
  it 'returns a list of cleaned words on init by default' do
    raw = ['UnFormaTted', ' lisT ', ' of', ' WordS']

    expect(TextFormatter.new(raw).words).to eq %w[unformatted list of words]
  end

  it 'returns a list of words on init if cleaning is overriden' do
    words = ['UnFormaTted', ' lisT ', ' of', ' WordS']
    formatter = TextFormatter.new(words, clean: false)

    expect(formatter.words).to eq words
  end

  it 'returns a words without leading and trailing punctuation, no numerics' do
    formatter = TextFormatter.new(['a', 'message,', 'w!th', '6', 'words!', '!oi!'])
    formatter.scrub!

    expect(formatter.words).to eq ['a', 'message', 'w!th', 'words', 'oi']
  end
end
|
data/spec/spec_helper.rb
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
require 'chitchat'
|
metadata
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: chitchat
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Ryan Collins
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2016-05-30 00:00:00.000000000 Z
|
|
12
|
+
dependencies: []
|
|
13
|
+
description: Provides an API for working with conversation histories
|
|
14
|
+
email: ryancollins.biz@gmail.com
|
|
15
|
+
executables: []
|
|
16
|
+
extensions: []
|
|
17
|
+
extra_rdoc_files: []
|
|
18
|
+
files:
|
|
19
|
+
- dict/dict.txt
|
|
20
|
+
- dict/level_1_stop_words.txt
|
|
21
|
+
- dict/level_2_stop_words.txt
|
|
22
|
+
- lib/chitchat.rb
|
|
23
|
+
- lib/chitchat/data_handler.rb
|
|
24
|
+
- lib/chitchat/lexical_processor.rb
|
|
25
|
+
- lib/chitchat/text_formatter.rb
|
|
26
|
+
- spec/fixtures/g0.txt
|
|
27
|
+
- spec/fixtures/g1.txt
|
|
28
|
+
- spec/lib/chitchat/data_handler_spec.rb
|
|
29
|
+
- spec/lib/chitchat/lexical_processor_spec.rb
|
|
30
|
+
- spec/lib/chitchat/text_formatter_spec.rb
|
|
31
|
+
- spec/spec_helper.rb
|
|
32
|
+
homepage:
|
|
33
|
+
licenses: []
|
|
34
|
+
metadata: {}
|
|
35
|
+
post_install_message:
|
|
36
|
+
rdoc_options: []
|
|
37
|
+
require_paths:
|
|
38
|
+
- lib
|
|
39
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
40
|
+
requirements:
|
|
41
|
+
- - ">="
|
|
42
|
+
- !ruby/object:Gem::Version
|
|
43
|
+
version: '0'
|
|
44
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
45
|
+
requirements:
|
|
46
|
+
- - ">="
|
|
47
|
+
- !ruby/object:Gem::Version
|
|
48
|
+
version: '0'
|
|
49
|
+
requirements: []
|
|
50
|
+
rubyforge_project:
|
|
51
|
+
rubygems_version: 2.5.1
|
|
52
|
+
signing_key:
|
|
53
|
+
specification_version: 4
|
|
54
|
+
summary: NLP and parsing for popular instant messaging services
|
|
55
|
+
test_files: []
|