one-hot-dictionary 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. checksums.yaml +7 -0
  2. data/lib/DICTIONARY.rb +123 -0
  3. data/lib/ENCODER.rb +132 -0
  4. metadata +44 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 9220f9fc81b30d224a1393ccc7fcd6a2ed103e2725a7cd1d3167c05db377448a
4
+ data.tar.gz: ffe5c90869236274f1d50824a9b9df9ceb0182e237fcb142e1f66fa5406d1e75
5
+ SHA512:
6
+ metadata.gz: 2bbb25ea5ffa11ab111193e827ecc093f5665c85b235971fe31e98044b429eccb0346b71d9d9110773fae3edd91e939781cbb9b61703aee5d4f67d70e6e03dd6
7
+ data.tar.gz: 44488a304307b8f845aba24739eac3c95b9d5bf1a5c5d10ba28d01f627116ef591a8cfe8fac1478a22bd196612ee2d4ef2baeae5a3cf912b74a5608e2e0403b3
data/lib/DICTIONARY.rb ADDED
@@ -0,0 +1,123 @@
1
+ #!/usr/bin/env ruby
2
+ =begin
3
+ Created: 14/09/2023.
4
+ Version: 1.0.0
5
+ =end
6
+ require 'numo/narray'
7
+ include Numo
8
+ =begin
9
+ Dictionary
10
+ =end
11
# DICTIONARY inherits from ENCODER, which lives in a sibling file; load it
# here when the caller has not already done so, otherwise the class line
# below raises NameError.
require_relative 'ENCODER' unless defined?(ENCODER)

# Word-level one-hot dictionary.
#
# Builds a vocabulary from a text, ranks the words by frequency and assigns
# each of the first +x_dim+ words a power-of-two integer code (1, 2, 4, ...).
# A code is expanded to a row of +x_dim+ 0/1 values with the most significant
# bit first, so the word with code 1 has its "hot" bit in the LAST column.
#
# Rows are stored in Numo arrays when encoding (Int32, resolved through the
# file's top-level `include Numo`).
class DICTIONARY < ENCODER
  # Optionally builds the dictionary at construction time.
  #
  # string - String (split on whitespace) or Array of words; optional.
  # x_dim  - Integer width of the one-hot rows / vocabulary limit; optional.
  #
  # When either argument is omitted nothing is built, and `init` must be
  # called later (the original two-step usage keeps working).
  def initialize(string = nil, x_dim = nil)
    super()
    init(string, x_dim) unless string.nil? || x_dim.nil?
  end

  # Builds the frequency table and both encoding hashes.
  #
  # string - String or Array of words to learn the vocabulary from.
  # x_dim  - Integer number of columns in every encoded row; also the
  #          maximum number of words that receive an encoding.
  def init(string, x_dim)
    @x_dim = x_dim
    @frequencyHash = wordFrequency(string)
    @stringToEncoding, @encodingToString = hotEncodeVocabulary(@frequencyHash, x_dim)
  end

  # Returns the frequency table built by `init`: an Array of [word, count]
  # pairs as produced by `wordFrequency`.
  def getFrequencyHash()
    @frequencyHash
  end

  # Returns two values: the word => code Hash and the code => word Hash.
  def getEncodingHashs()
    return @stringToEncoding, @encodingToString
  end

  # Counts how often each word occurs.
  #
  # words - String (split on whitespace) or Array of words.
  #
  # Returns an Array of [word, count] pairs ordered from the highest count to
  # the lowest. The relative order of words with equal counts is whatever
  # `sort_by` followed by `reverse` yields.
  def wordFrequency(words)
    words = words.split if words.is_a?(String)
    counts = Hash.new(0)
    words.each { |word| counts[word] += 1 }
    counts.sort_by { |_word, count| count }.reverse
  end

  # Assigns a one-hot integer code to the first +maxEntries+ entries.
  #
  # hash       - Enumerable of [word, count] pairs (a Hash or the Array
  #              returned by `wordFrequency`), iterated in order.
  # maxEntries - Integer maximum number of words to encode. Zero or a
  #              negative value encodes nothing.
  #
  # The n-th entry (counting from 0) gets the code `1 << n`.
  #
  # Returns two Hashes: word (as String) => code, and code => word.
  def hotEncodeVocabulary(hash, maxEntries)
    stringToEncoding = {}
    encodingToString = {}
    limit = [maxEntries, 0].max
    hash.first(limit).each_with_index do |(key, _count), position|
      word = key.to_s
      code = 1 << position
      stringToEncoding[word] = code
      encodingToString[code] = word
    end
    return stringToEncoding, encodingToString
  end

  # Cuts an input window and a shifted target window out of one text.
  #
  # fileDataArray - Array of Strings (one per file).
  # fileIndex     - Integer index of the text to use.
  # start         - Integer index of the first word of the input window.
  # input_len     - Integer number of words in each window.
  # predict_len   - Integer offset (in words) of the target window relative
  #                 to the input window.
  #
  # Returns two values: the input words and the target words. Either may be
  # shorter than +input_len+, or nil, when the window runs past the end of
  # the text (standard Array#[] start/length semantics).
  def readFileDataArray(fileDataArray, fileIndex, start, input_len, predict_len)
    fileWordArray = fileDataArray[fileIndex].split
    return fileWordArray[start, input_len], fileWordArray[start + predict_len, input_len]
  end

  # Encodes words as rows of 0/1 values.
  #
  # wordArray - String (split on whitespace) or Array of words.
  # hash      - word => code Hash; defaults to the one built by `init`.
  #
  # Returns an Int32 array with one row per word and @x_dim columns. Words
  # without an encoding leave their row all zeros.
  def encodeArray(wordArray, hash=@stringToEncoding)
    wordArray = wordArray.split if wordArray.is_a?(String)
    encoded = Int32.zeros(wordArray.length, @x_dim)
    wordArray.each_with_index do |word, row|
      code = hash[word]
      encoded[row, true] = binaryToArray(code, @x_dim) unless code.nil?
    end
    encoded
  end

  # Expands an integer code into its bits.
  #
  # binary - Integer code.
  # x_dim  - Integer number of bits to produce.
  #
  # Returns an Array of +x_dim+ Integers (0 or 1), most significant bit
  # first. Bits above +x_dim+ are ignored.
  def binaryToArray(binary, x_dim)
    # Integer#[] reads a single bit; position 0 of the result is bit x_dim-1.
    Array.new(x_dim) { |position| binary[x_dim - 1 - position] }
  end

  # Packs a row of 0/1 values back into an integer code.
  #
  # array - anything responding to `to_a` whose (possibly nested) elements
  #         are numeric; the first element is the most significant bit.
  #         Each element is truncated with `to_i` and only its lowest bit is
  #         used, so 1 and 1.0 both count as a set bit.
  #
  # Returns the Integer code.
  def arrayToBinary(array)
    array.to_a.flatten.inject(0) { |code, bit| (code << 1) | (bit.to_i & 1) }
  end

  # Decodes rows that are exact one-hot encodings.
  #
  # encodedArray - two-dimensional collection (anything whose `to_a` yields
  #                one Array per row).
  # hash         - code => word Hash; defaults to the one built by `init`.
  #
  # Returns a String with every decoded word followed by a single space;
  # rows whose code is not in +hash+ contribute "@ ".
  def decodeArray(encodedArray, hash=@encodingToString)
    encodedArray.to_a.map { |row| decodeWord(arrayToBinary(row), hash) + " " }.join
  end

  # Decodes rows by taking the position of the largest value in each row.
  #
  # encodedArray - two-dimensional collection (anything whose `to_a` yields
  #                one Array per row).
  # hash         - code => word Hash; defaults to the one built by `init`.
  #
  # Returns a String with one token per row, each followed by a space: the
  # decoded word, "@ " when the winning position has no word, or "$ " when
  # the row's maximum is 0 (or the row is empty).
  def decodeArrayByMaximum(encodedArray, hash=@encodingToString)
    encodedArray.to_a.map do |row|
      peak = row.max
      next "$ " if peak.nil? || peak == 0

      # Ties resolve to the right-most position.
      hottest = row.rindex(peak)
      # Column 0 is the most significant bit, so column c maps to 1 << (width-1-c).
      decodeWord(1 << (row.length - 1 - hottest), hash) + " "
    end.join
  end

  # Prints every key/value pair of +hash+ followed by the number of entries.
  def viewHash(hash)
    puts "key -> value"
    hash.each { |key, value| puts key.to_s + " -> " + value.to_s }
    puts "Entires in hash: " + hash.count.to_s
  end

  private

  # Looks up the word for +code+, falling back to the "@" placeholder.
  def decodeWord(code, hash)
    hash[code] || "@"
  end
end
data/lib/ENCODER.rb ADDED
@@ -0,0 +1,132 @@
1
+ #!/usr/bin/env ruby
2
+ =begin
3
+ ██╗░░░░░░██████╗████████╗███╗░░░███╗
4
+ ██║░░░░░██╔════╝╚══██╔══╝████╗░████║
5
+ ██║░░░░░╚█████╗░░░░██║░░░██╔████╔██║
6
+ ██║░░░░░░╚═══██╗░░░██║░░░██║╚██╔╝██║
7
+ ███████╗██████╔╝░░░██║░░░██║░╚═╝░██║
8
+ ╚══════╝╚═════╝░░░░╚═╝░░░╚═╝░░░░░╚═╝
9
+ Created: 14/09/2023.
10
+ Version: 1.0.0
11
+ Author: Ryan May
12
+ This is a text generator LSTM developed in Ruby.
13
+ In this project I gain more of an understanding of the training and creation of LSTM networks.
14
+ This LSTM network utilises the Numo::Narray library to calculate cell states and the cell network.
15
+ Expansion on this project will be utilised in my Honours project.
16
+ =end
17
+ require 'matrix' # https://www.rubyguides.com/2019/01/ruby-matrix/
18
+ require 'numo/narray'
19
+ include Numo
20
+ =begin
21
+ ███████╗███╗░░██╗░█████╗░░█████╗░██████╗░███████╗██████╗░
22
+ ██╔════╝████╗░██║██╔══██╗██╔══██╗██╔══██╗██╔════╝██╔══██╗
23
+ █████╗░░██╔██╗██║██║░░╚═╝██║░░██║██║░░██║█████╗░░██████╔╝
24
+ ██╔══╝░░██║╚████║██║░░██╗██║░░██║██║░░██║██╔══╝░░██╔══██╗
25
+ ███████╗██║░╚███║╚█████╔╝╚█████╔╝██████╔╝███████╗██║░░██║
26
+ ╚══════╝╚═╝░░╚══╝░╚════╝░░╚════╝░╚═════╝░╚══════╝╚═╝░░╚═╝
27
+ The encoder class is used to translate input characters and sentences into one-hot encoded vectors.
28
+ Additionally, the encoder class contains functions to handle file reading, regex filtering, and conversions
29
+ between 'Matrix' and 'NArray' types.
30
+ =end
31
# Character-level one-hot encoder.
#
# Maps every character of a configurable character set to a one-hot Vector
# (from the stdlib 'matrix' library) and back, and bundles a few helpers for
# reading plaintext files, regex filtering and Matrix <-> NArray conversion.
class ENCODER
  # Default character set: '.', ',', space and '!' followed by a-z and A-Z
  # (56 symbols in total).
  DEFAULT_CHARACTERS = ['.', ',', "\s", '!', *('a'..'z'), *('A'..'Z')].freeze

  # Creates an encoder that is immediately usable.
  #
  # charmatrix - Array of single-character Strings to encode, or nil for the
  #              default 56-character set.
  def initialize(charmatrix = nil)
    setupCharacterSet(charmatrix)
  end

  # (Re)configures the character set. Kept for callers that use the original
  # two-step `ENCODER.new` + `init` pattern.
  #
  # charmatrix - Array of single-character Strings, or nil for the default set.
  def init(charmatrix = nil)
    setupCharacterSet(charmatrix)
  end

  # Converts a Matrix into a DFloat array built from the matrix rows.
  # DFloat is resolved through the file's top-level `include Numo`.
  def matrixToNArray(matrix)
    DFloat[*matrix.to_a]
  end

  # Converts an array-like object (anything whose `to_a` yields rows) into a
  # Matrix.
  def nArrayToMatrix(nArray)
    Matrix[*nArray.to_a]
  end

  # Lists the entries of +path+ that are neither directories nor dot-files.
  #
  # path - String directory path.
  #
  # Returns an Array of file names (without the directory part), in the order
  # reported by Dir.entries.
  def listFilesInDirectory(path)
    Dir.entries(path).reject do |name|
      # Resolve the entry against +path+; a bare name would be checked
      # against the current working directory instead.
      name.start_with?('.') || File.directory?(File.join(path, name))
    end
  end

  # Reads every file returned by `listFilesInDirectory(path)`.
  #
  # path - String directory path.
  #
  # Prints a "reading file ..." line per file and returns an Array with the
  # contents of each file, in listing order.
  def readPlaintextFilesInDirectory(path)
    listFilesInDirectory(path).map do |name|
      fullPath = path + "/" + name
      puts "reading file " + fullPath
      readPlaintextfile(fullPath)
    end
  end

  # Filters one text and one-hot encodes a window of it.
  #
  # fileDataArray - Array of Strings (one per file).
  # index         - Integer index of the text to use.
  # start         - Integer character offset (after filtering) of the window.
  # input_len     - Integer, added to +predict_len+ to get the window length.
  # predict_len   - Integer, see +input_len+.
  #
  # Returns a Matrix with one row per encoded character; the matrix is empty
  # when +start+ lies beyond the end of the filtered text.
  def readFileDataArray(fileDataArray, index, start, input_len, predict_len)
    window = filter(fileDataArray[index])[start, predict_len + input_len]
    # String#[] yields nil past the end of the string; treat that as "no text".
    hotEncodeSentance(window.to_s)
  end

  # Returns the full contents of the file at +fileName+ as a String.
  def readPlaintextfile(fileName)
    # File.read closes the handle even when reading raises.
    File.read(fileName)
  end

  # Removes unwanted characters from a string.
  #
  # sentance - String to clean.
  # regex    - pattern of text to delete; when nil, everything except ASCII
  #            letters, '.', ',' and space is deleted.
  #
  # NOTE(review): '!' is part of the default character set but is removed by
  # the default pattern - confirm whether that is intended.
  #
  # Returns the filtered String.
  def filter(sentance, regex=nil)
    pattern = regex.nil? ? /[^A-Za-z\., ]/ : regex
    sentance.gsub(pattern, '')
  end

  # One-hot encodes every character of +sentance+.
  #
  # Characters that are not in the character set are skipped.
  #
  # Returns a Matrix with one row per encoded character (an empty Matrix when
  # nothing could be encoded).
  def hotEncodeSentance(sentance)
    rows = sentance.each_char.map { |char| hotEncodeCharacter(char) }.compact.map(&:to_a)
    Matrix.rows(rows)
  end

  # Decodes every row of +matrix+ with `hotDecodeCharacter` and joins the
  # resulting characters into one String.
  def hotDecodeSentance(matrix)
    matrix.row_vectors.map { |charvector| hotDecodeCharacter(charvector) }.join
  end

  # One-hot encodes a single character.
  #
  # char - single-character String.
  #
  # Returns a Vector with a 1 at the character's position in the character
  # set and 0 elsewhere, or nil when the character is not in the set.
  def hotEncodeCharacter(char)
    index = @CharMatrix.index(char)
    # The selection matrix is rebuilt on every call, as before; for unknown
    # characters it is left all zeros.
    @SelectionMatrix = Matrix.build(1, @Length) { |_row, col| col == index ? 1 : 0 }
    return nil if index.nil?

    @SelectionMatrix.row(0)
  end

  # Decodes a vector to the character at the position of its largest value.
  #
  # vector - Vector or Array of numbers.
  #
  # Ties resolve to the right-most position. Returns '@' when the vector is
  # empty, contains only zeros, or points outside the character set.
  def hotDecodeCharacter(vector)
    values = vector.to_a
    return '@' if values.all? { |value| value == 0 }

    @CharMatrix[values.rindex(values.max)] || '@'
  end

  # Fraction of character positions at which two strings differ.
  #
  # Positions are compared pairwise; positions that exist in only one of the
  # strings count as differences. The result is relative to the longer
  # string, so it always lies in 0.0..1.0 and is 0.0 for two empty strings.
  def stringDifferencePercent(a, b)
    longer = [a.size, b.size].max
    return 0.0 if longer.zero?

    same = a.each_char.zip(b.each_char).count { |left, right| left == right }
    (longer - same) / longer.to_f
  end

  private

  # Stores the character set, its length and a zeroed 1 x length selection
  # matrix in the instance variables used by the encode/decode methods.
  def setupCharacterSet(charmatrix)
    characters = charmatrix.nil? ? DEFAULT_CHARACTERS.dup : charmatrix
    @Length = characters.length
    @SelectionMatrix = Matrix.build(1, @Length) { 0 } # 1 row, one column per character
    @CharMatrix = characters
  end
end
metadata ADDED
@@ -0,0 +1,44 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: one-hot-dictionary
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Ryan May
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2023-06-29 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description:
14
+ email: 19477774@student.curtin.edu.au
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/DICTIONARY.rb
20
+ - lib/ENCODER.rb
21
+ homepage: https://github.com/ryan-n-may/Ruby_One_Hot_Dictionary
22
+ licenses:
23
+ - MIT
24
+ metadata: {}
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubygems_version: 3.3.15
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: A dictionary gem that one-hot encodes strings.
44
+ test_files: []