RubyGems - chitchat - Versions diffs - 1.0.0 - Mend

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

checksums.yaml +7 -0
data/dict/dict.txt +235886 -0
data/dict/level_1_stop_words.txt +173 -0
data/dict/level_2_stop_words.txt +665 -0
data/lib/chitchat.rb +3 -0
data/lib/chitchat/data_handler.rb +49 -0
data/lib/chitchat/lexical_processor.rb +34 -0
data/lib/chitchat/text_formatter.rb +44 -0
data/spec/fixtures/g0.txt +3 -0
data/spec/fixtures/g1.txt +4 -0
data/spec/lib/chitchat/data_handler_spec.rb +14 -0
data/spec/lib/chitchat/lexical_processor_spec.rb +14 -0
data/spec/lib/chitchat/text_formatter_spec.rb +22 -0
data/spec/spec_helper.rb +1 -0
metadata +55 -0

data/lib/chitchat.rb ADDED Viewed

@@ -0,0 +1,3 @@
+require_relative 'chitchat/data_handler'
+require_relative 'chitchat/text_formatter'
+require_relative 'chitchat/lexical_processor'

data/lib/chitchat/data_handler.rb ADDED Viewed

@@ -0,0 +1,49 @@
require 'csv'

# Reads a semicolon-separated chat export and exposes the words it contains,
# either as one flat list or grouped by the name of the sender.
#
# The first row of the file is treated as a header and skipped. Column
# positions are given by the *_COL constants below.
class DataHandler
  UNIXTIME_COL = 0
  TIMESTAMP_COL = 1
  SENDER_ID_COL = 2
  SENDER_NAME_COL = 3
  MESSAGE_TYPE_COL = 4
  MESSAGE_COL = 5
  MESSAGE_HTML_COL = 6

  # Columns needed to answer "who said what": [sender name, message].
  PERSON_SAYS = [SENDER_NAME_COL, MESSAGE_COL].freeze

  # Parser settings for the export format (';' separated, '"' quoted).
  CSV_OPTIONS = { col_sep: ';', quote_char: '"' }.freeze

  # @param text_file [String] path to the chat export to read
  def initialize(text_file)
    @text_file = text_file
  end

  # Words spoken by each sender, in file order. Senders whose name matches
  # /unknown/ are left out. The result is memoized.
  #
  # @return [Hash{String => Array<String>}] sender name => words
  def words_by_person
    @words_by_person ||= parse_words_by_person
  end

  # Every whitespace-separated word of every message, in file order.
  # The result is memoized.
  #
  # @return [Array<String>] empty when the file holds no messages
  def all_words
    @all_words ||= parse_all_words
  end

  private

  # Parses the file into rows, dropping the header row and blank rows.
  def csv_as_array
    # The options must be splatted: since Ruby 3.0 a positional Hash is no
    # longer converted to keyword arguments, so passing it as a second
    # positional argument raises ArgumentError.
    CSV.read(@text_file, **CSV_OPTIONS).drop(1).reject(&:empty?)
  end

  # Groups the words of every message under the name of its sender.
  def parse_words_by_person
    grouped = delete_empty_messages.each_with_object({}) do |(person, message), acc|
      # to_s guards against rows whose message cell is missing (nil).
      (acc[person] ||= []).concat(message.to_s.split(' '))
    end
    grouped.delete_if { |person, _words| person.to_s =~ /unknown/ }
  end

  # Reduces every row to its [sender name, message] pair (see PERSON_SAYS).
  # Despite the name, no rows are removed here; a missing cell becomes nil.
  def delete_empty_messages
    csv_as_array.map { |row| row.values_at(*PERSON_SAYS) }
  end

  # Splits every message into words and concatenates them. flat_map always
  # returns an Array, whereas flatten! returns nil when nothing was flattened
  # (for example, a file with only a header row).
  def parse_all_words
    csv_as_array.flat_map { |row| row[MESSAGE_COL].to_s.split }
  end
end

data/lib/chitchat/lexical_processor.rb ADDED Viewed

@@ -0,0 +1,34 @@
# Removes stop words from a list of words using the word lists shipped in the
# gem's dict/ directory.
class LexicalProcessor
  # Absolute paths, so the lists are found regardless of the directory the
  # calling process was started from.
  DICT_DIR = File.expand_path('../../dict', __dir__).freeze
  LEVEL_1_STOP_WORDS = File.join(DICT_DIR, 'level_1_stop_words.txt').freeze
  LEVEL_2_STOP_WORDS = File.join(DICT_DIR, 'level_2_stop_words.txt').freeze

  attr_reader :words

  # @param words [Array<String>] the words to process
  def initialize(words)
    @words = words
  end

  # Replaces #words with a copy that has the stop words of the given level
  # and every word listed in +augment+ removed.
  #
  # @param level [Integer] which stop word list to use (1 or 2)
  # @param augment [Array<String>] additional words to remove
  # @return [Array<String>] the remaining words
  # @raise [ArgumentError] if +level+ is neither 1 nor 2
  def strip_stop_words!(level: 1, augment: [])
    @words = filter(level).filter(@words) - augment
  end

  private

  # Builds the stop word filter for the requested level.
  def filter(level)
    list_file =
      case level
      when 1 then LEVEL_1_STOP_WORDS
      when 2 then LEVEL_2_STOP_WORDS
      else raise ArgumentError, 'Invalid stop word level', caller
      end

    # Loaded on first use so this class does not rely on another file having
    # required the stopwords gem beforehand.
    require 'stopwords'
    Stopwords::Filter.new(stop_words(list_file))
  end

  # Reads one stop word per line. Surrounding whitespace (including "\r" from
  # CRLF line endings) is trimmed and blank lines are skipped.
  def stop_words(file)
    File.readlines(file).map(&:strip).reject(&:empty?)
  end
end

data/lib/chitchat/text_formatter.rb ADDED Viewed

@@ -0,0 +1,44 @@
require 'spellingbee'
require 'stopwords'

# Normalizes a list of words: lower-casing and trimming on construction, and
# on request stripping punctuation, dropping numbers and correcting spelling.
class TextFormatter
  # Absolute path, so the dictionary is found regardless of the directory the
  # calling process was started from.
  DICT = File.expand_path('../../dict/dict.txt', __dir__).freeze

  # Defaults for #scrub!. Only :spelling_bee is consulted by this class.
  DEFAULT_SCRUB_OPTIONS = { stopword_level: 2,
                            custom_words: [].freeze,
                            spelling_bee: false }.freeze

  # \A and \z anchor to the whole string; ^ and $ would match at every line
  # boundary inside a word that contains a newline.
  LEADING_PUNCTUATION = /\A[[:punct:]]+/
  TRAILING_PUNCTUATION = /[[:punct:]]+\z/
  NUMERIC = /\A[+-]?(\d*\.)?\d+\z/

  attr_reader :words

  # @param words_array [Array<String>] the words to format; the array itself
  #   is copied, so the caller's array is not modified by this object
  # @param clean [Boolean] lower-case and trim every word immediately
  def initialize(words_array, clean: true)
    @words = words_array.dup
    make_clean if clean
  end

  # Strips leading and trailing punctuation, removes numeric words and,
  # when options[:spelling_bee] is truthy, replaces each word with its first
  # spelling suggestion.
  #
  # @param options [Hash] overrides merged over DEFAULT_SCRUB_OPTIONS, so a
  #   partial hash keeps the remaining defaults
  # @return [Array<String>] the scrubbed words
  def scrub!(options: DEFAULT_SCRUB_OPTIONS)
    settings = DEFAULT_SCRUB_OPTIONS.merge(options)
    strip_punctuation!
    remove_numeric!
    correct_spelling! if settings[:spelling_bee]
    @words
  end

  private

  # The spell checker is created on first use: building it reads the whole
  # dictionary file, which is wasted work when spell checking is disabled.
  def spell_checker
    @s ||= SpellingBee.new source_text: DICT
  end

  # Strips all trailing and leading punctuation
  # aka !!oi!! will become oi but y!y will be unchanged.
  # Words consisting only of punctuation end up empty and are dropped.
  def strip_punctuation!
    @words.map! do |w|
      w.sub(LEADING_PUNCTUATION, '').sub(TRAILING_PUNCTUATION, '')
    end
    @words.delete_if(&:empty?)
  end

  # Replaces each word with the first suggestion of the spell checker.
  # NOTE(review): assumes `correct` returns an Array of candidates; the word
  # is kept as-is if that Array is empty -- confirm against spellingbee.
  def correct_spelling!
    @words.map! { |w| spell_checker.correct(w).first || w }
  end

  # Drops words that are plain integers or decimals, optionally signed.
  def remove_numeric!
    @words.delete_if { |w| w =~ NUMERIC }
  end

  # Lower-cases every word and trims surrounding whitespace.
  def make_clean
    @words.map! { |w| w.downcase.strip }
  end
end

data/spec/fixtures/g0.txt ADDED Viewed

@@ -0,0 +1,3 @@
+unixtime;timestamp;sender_id;sender_name;message_type;message;message_html
+1403115881045472;"2014-06-18 14:24:41";105519660812379208737;"Ryan Collins";REGULAR_CHAT_MESSAGE;"A sample regular message"

data/spec/fixtures/g1.txt ADDED Viewed

@@ -0,0 +1,4 @@
+unixtime;timestamp;sender_id;sender_name;message_type;message;message_html
+1403115881045472;"2014-06-18 14:24:41";105519660812379208737;"Ryan Collins";REGULAR_CHAT_MESSAGE;"A sample message from Ryan"
+1403115964483918;"2014-06-18 14:26:04";103491988956526132192;"Alex Doliner";REGULAR_CHAT_MESSAGE;"A sample message from Alex"

data/spec/lib/chitchat/data_handler_spec.rb ADDED Viewed

@@ -0,0 +1,14 @@
# Specs for DataHandler, driven by the CSV fixtures under spec/fixtures.
describe 'data handler' do
  it 'returns a list of isolated words given a gchat csv file' do
    # g0.txt holds a single message.
    handler = DataHandler.new('spec/fixtures/g0.txt')
    expected_words = %w[A sample regular message]

    expect(handler.all_words).to match expected_words
  end

  it 'returns a list of words per person in gchat convo given a csv file' do
    # g1.txt holds one message from each of two senders.
    handler = DataHandler.new('spec/fixtures/g1.txt')
    expected_by_person = [
      ['Ryan Collins', %w[A sample message from Ryan]],
      ['Alex Doliner', %w[A sample message from Alex]]
    ]

    expected_by_person.each do |person, expected_words|
      expect(handler.words_by_person[person]).to match expected_words
    end
  end
end

data/spec/lib/chitchat/lexical_processor_spec.rb ADDED Viewed

@@ -0,0 +1,14 @@
# Specs for LexicalProcessor#strip_stop_words!.
describe 'lexical processor' do
  let(:words) { %w[we've all been there another day dollar] }
  # A fresh processor per example, built from the shared word list.
  let(:processor) { LexicalProcessor.new(words) }

  it 'returns a list of words without stop words based on default list' do
    processor.strip_stop_words!

    expect(processor.words).to eq %w[another day dollar]
  end

  it 'excludes custom words' do
    processor.strip_stop_words!(augment: ['dollar'])

    expect(processor.words).to eq %w[another day]
  end
end

data/spec/lib/chitchat/text_formatter_spec.rb ADDED Viewed

@@ -0,0 +1,22 @@
# Specs for TextFormatter: cleaning on construction and #scrub!.
describe 'text formatter' do
  it 'returns a list of cleaned words on init by default' do
    text_formatter = TextFormatter.new(['UnFormaTted',
                                        ' lisT ',
                                        ' of',
                                        '   WordS'])
    expect(text_formatter.words).to eq %w(unformatted list of words)
  end

  it 'returns a list of words on init if cleaning is overriden' do
    words = ['UnFormaTted', ' lisT ', ' of', '   WordS']
    text_formatter = TextFormatter.new(words, clean: false)
    # Compare against an independent literal. Comparing against `words`
    # itself can never fail when the formatter holds (and could modify in
    # place) that very same Array object.
    expect(text_formatter.words)
      .to eq ['UnFormaTted', ' lisT ', ' of', '   WordS']
  end

  it 'returns a words without leading and trailing punctuation, no numerics' do
    words = ['a', 'message,', 'w!th', '6', 'words!', '!oi!']
    text_formatter = TextFormatter.new(words)
    text_formatter.scrub!
    expect(text_formatter.words).to eq ['a', 'message', 'w!th', 'words', 'oi']
  end
end

data/spec/spec_helper.rb ADDED Viewed

	@@ -0,0 +1 @@
1	+ require 'chitchat'

metadata ADDED Viewed

@@ -0,0 +1,55 @@
+--- !ruby/object:Gem::Specification
+name: chitchat
+version: !ruby/object:Gem::Version
+  version: 1.0.0
+platform: ruby
+authors:
+- Ryan Collins
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2016-05-30 00:00:00.000000000 Z
+dependencies: []
+description: Provides an API for working with conversation histories
+email: ryancollins.biz@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- dict/dict.txt
+- dict/level_1_stop_words.txt
+- dict/level_2_stop_words.txt
+- lib/chitchat.rb
+- lib/chitchat/data_handler.rb
+- lib/chitchat/lexical_processor.rb
+- lib/chitchat/text_formatter.rb
+- spec/fixtures/g0.txt
+- spec/fixtures/g1.txt
+- spec/lib/chitchat/data_handler_spec.rb
+- spec/lib/chitchat/lexical_processor_spec.rb
+- spec/lib/chitchat/text_formatter_spec.rb
+- spec/spec_helper.rb
+homepage:
+licenses: []
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.5.1
+signing_key:
+specification_version: 4
+summary: NLP and parsing for popular instant messaging services
+test_files: []

chitchat 1.0.0