one-hot-dictionary 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/DICTIONARY.rb +123 -0
- data/lib/ENCODER.rb +132 -0
- metadata +44 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 9220f9fc81b30d224a1393ccc7fcd6a2ed103e2725a7cd1d3167c05db377448a
|
|
4
|
+
data.tar.gz: ffe5c90869236274f1d50824a9b9df9ceb0182e237fcb142e1f66fa5406d1e75
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 2bbb25ea5ffa11ab111193e827ecc093f5665c85b235971fe31e98044b429eccb0346b71d9d9110773fae3edd91e939781cbb9b61703aee5d4f67d70e6e03dd6
|
|
7
|
+
data.tar.gz: 44488a304307b8f845aba24739eac3c95b9d5bf1a5c5d10ba28d01f627116ef591a8cfe8fac1478a22bd196612ee2d4ef2baeae5a3cf912b74a5608e2e0403b3
|
data/lib/DICTIONARY.rb
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
=begin
|
|
3
|
+
Created: 14/09/2023.
|
|
4
|
+
Version: 1.0.0
|
|
5
|
+
=end
|
|
6
|
+
require 'numo/narray'
|
|
7
|
+
include Numo
|
|
8
|
+
=begin
|
|
9
|
+
Dictionary
|
|
10
|
+
=end
|
|
11
|
+
class DICTIONARY < ENCODER
|
|
12
|
+
# Builds the dictionary state from a body of text.
#
# string - text (or word list) passed straight to wordFrequency.
# x_dim  - width used for one-hot rows; also handed to hotEncodeVocabulary
#          as the maximum number of vocabulary entries.
#
# Sets @x_dim, @frequencyHash, @stringToEncoding and @encodingToString.
# Returns the pair of lookup tables produced by hotEncodeVocabulary.
def init(string, x_dim)
  @x_dim = x_dim
  @frequencyHash = wordFrequency(string)
  lookupTables = hotEncodeVocabulary(@frequencyHash, x_dim)
  @stringToEncoding, @encodingToString = lookupTables
  lookupTables
end
|
|
17
|
+
# Returns whatever is currently stored in @frequencyHash.
def getFrequencyHash
  @frequencyHash
end
|
|
20
|
+
# Returns both lookup tables as a two-element array:
# [@stringToEncoding, @encodingToString].
def getEncodingHashs
  [@stringToEncoding, @encodingToString]
end
|
|
23
|
+
# Counts how often each word occurs.
#
# words - a String (split on whitespace) or a collection of words.
#
# Returns an Array of [word, count] pairs ordered from most to least
# frequent. Note that the result is an Array of pairs, not a Hash.
def wordFrequency(words)
  words = words.split if words.is_a?(String)
  # Hash.new(0) lets every word start at zero, so no nil check is needed.
  frequency = words.each_with_object(Hash.new(0)) { |word, counts| counts[word] += 1 }
  frequency.sort_by { |_word, count| count }.reverse
end
|
|
39
|
+
# Assigns a one-hot integer code to each vocabulary entry.
#
# hash       - enumerable of [word, count] pairs (a Hash or the Array
#              returned by wordFrequency); only the word is used.
# maxEntries - maximum number of entries to encode. Zero or a negative
#              value yields two empty tables.
#
# The entry at position p receives the code 1 << p.
#
# Returns [stringToEncoding, encodingToString]: word => code and
# code => word.
def hotEncodeVocabulary(hash, maxEntries)
  stringToEncoding = {}
  encodingToString = {}
  hash.each_with_index do |(key, _count), position|
    # Check the limit before inserting so maxEntries <= 0 encodes nothing.
    break if position >= maxEntries

    word = key.to_s
    code = 1 << position
    stringToEncoding[word] = code
    encodingToString[code] = word
  end
  [stringToEncoding, encodingToString]
end
|
|
53
|
+
# Cuts an input window and a shifted target window out of one file's words.
#
# fileDataArray - collection of file contents (strings).
# fileIndex     - which entry of fileDataArray to use.
# start         - index of the first word of the input window.
# input_len     - number of words in each window.
# predict_len   - offset, in words, of the target window from the input.
#
# Returns [inputWords, targetWords]; a window is nil when its start lies
# beyond the end of the word list.
def readFileDataArray(fileDataArray, fileIndex, start, input_len, predict_len)
  tokens = fileDataArray[fileIndex].split
  inputWindow = tokens[start, input_len]
  targetWindow = tokens[start + predict_len, input_len]
  [inputWindow, targetWindow]
end
|
|
57
|
+
# Encodes a sequence of words as rows of bits.
#
# wordArray - a String (split on whitespace) or a collection of words.
# hash      - word => integer code table; defaults to @stringToEncoding.
#
# Returns an Int32 array with one row per word and @x_dim columns. Rows for
# words missing from the table are left as all zeros.
def encodeArray(wordArray, hash=@stringToEncoding)
  # is_a? (rather than instance_of?) matches wordFrequency, so String
  # subclasses are split into words here as well.
  wordArray = wordArray.split if wordArray.is_a?(String)
  array = Int32.zeros(wordArray.length, @x_dim)
  wordArray.each_with_index do |word, row|
    encoding = hash[word]
    next if encoding.nil?

    array[row, true] = binaryToArray(encoding, @x_dim)
  end
  array
end
|
|
72
|
+
# Expands an integer into its bits, most significant bit first.
#
# binary - integer code to expand.
# x_dim  - number of bits (array length) to produce.
#
# Returns an Array of x_dim integers (0 or 1). Only the low x_dim bits of
# binary are represented, so the result always has exactly x_dim elements.
def binaryToArray(binary, x_dim)
  # Integer#[] reads a single bit; walk from bit x_dim-1 down to bit 0.
  Array.new(x_dim) { |position| binary[x_dim - 1 - position] }
end
|
|
78
|
+
# Packs a row of bits (most significant first) into an integer.
#
# array - anything responding to to_a whose elements are 0/1 values.
#         Nested arrays are flattened. Finite Float elements are truncated
#         to integers first, so a row such as [0.0, 1.0] packs like [0, 1]
#         instead of being rendered as "0.01.0".
#
# Returns the Integer obtained by reading the joined digits as base 2.
def arrayToBinary(array)
  digits = array.to_a.flatten.map do |element|
    element.is_a?(Float) && element.finite? ? element.to_i : element
  end
  digits.join.to_i(2)
end
|
|
83
|
+
# Decodes each row of an encoded array back into a word.
#
# encodedArray - two-dimensional array read via shape and [row, true].
# hash         - integer code => word table; defaults to @encodingToString.
#
# Returns a String in which every decoded word is followed by a space.
# Rows whose code is not in the table contribute "@ ".
def decodeArray(encodedArray, hash=@encodingToString)
  rowCount = encodedArray.shape[0]
  (0...rowCount).inject("") do |text, row|
    word = hash[arrayToBinary(encodedArray[row, true])]
    text + (word.nil? ? "@ " : word + " ")
  end
end
|
|
95
|
+
# Decodes each row by looking only at its largest element.
#
# encodedArray - two-dimensional array read via shape and [row, true].
# hash         - integer code => word table; defaults to @encodingToString.
#
# For every row whose maximum is not 0, a one-hot row with a single 1 at the
# position of the maximum is built and looked up. Ties resolve to the highest
# index, because [value, index] pairs are compared. Rows whose maximum is 0
# contribute "$ "; rows whose code is not in the table contribute "@ ".
#
# Returns a String in which every token is followed by a space.
def decodeArrayByMaximum(encodedArray, hash=@encodingToString)
  output = ""
  rowCount = encodedArray.shape[0]
  (0...rowCount).each do |row|
    values = encodedArray[row, true]
    if values.to_a.max == 0
      output += "$ "
      next
    end

    hottest = values.to_a.each_with_index.max[1]
    oneHot = Int32.zeros(values.shape)
    oneHot[hottest] = 1
    word = hash[arrayToBinary(oneHot)]
    output += (word.nil? ? "@ " : word + " ")
  end
  output
end
|
|
114
|
+
# Prints every key/value pair of a table, followed by the entry count.
#
# hash - a Hash or any collection yielding [key, value] pairs from each.
#
# Writes to standard output and returns nil.
def viewHash(hash)
  puts "key -> value"
  entries = 0
  hash.each do |key, value|
    puts "#{key} -> #{value}"
    entries += 1
  end
  puts "Entries in hash: #{entries}"
end
|
|
123
|
+
end
|
data/lib/ENCODER.rb
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
=begin
|
|
3
|
+
██╗░░░░░░██████╗████████╗███╗░░░███╗
|
|
4
|
+
██║░░░░░██╔════╝╚══██╔══╝████╗░████║
|
|
5
|
+
██║░░░░░╚█████╗░░░░██║░░░██╔████╔██║
|
|
6
|
+
██║░░░░░░╚═══██╗░░░██║░░░██║╚██╔╝██║
|
|
7
|
+
███████╗██████╔╝░░░██║░░░██║░╚═╝░██║
|
|
8
|
+
╚══════╝╚═════╝░░░░╚═╝░░░╚═╝░░░░░╚═╝
|
|
9
|
+
Created: 14/09/2023.
|
|
10
|
+
Version: 1.0.0
|
|
11
|
+
Author: Ryan May
|
|
12
|
+
This is a text generator LSTM developed in Ruby.
|
|
13
|
+
In this project I gain more of an understanding of the training and creation of LSTM networks.
|
|
14
|
+
This LSTM network utilises the Numo::Narray library to calculate cell states and the cell network.
|
|
15
|
+
Expansion on this project will be utilised in my Honours project.
|
|
16
|
+
=end
|
|
17
|
+
require 'matrix' # https://www.rubyguides.com/2019/01/ruby-matrix/
|
|
18
|
+
require 'numo/narray'
|
|
19
|
+
include Numo
|
|
20
|
+
=begin
|
|
21
|
+
███████╗███╗░░██╗░█████╗░░█████╗░██████╗░███████╗██████╗░
|
|
22
|
+
██╔════╝████╗░██║██╔══██╗██╔══██╗██╔══██╗██╔════╝██╔══██╗
|
|
23
|
+
█████╗░░██╔██╗██║██║░░╚═╝██║░░██║██║░░██║█████╗░░██████╔╝
|
|
24
|
+
██╔══╝░░██║╚████║██║░░██╗██║░░██║██║░░██║██╔══╝░░██╔══██╗
|
|
25
|
+
███████╗██║░╚███║╚█████╔╝╚█████╔╝██████╔╝███████╗██║░░██║
|
|
26
|
+
╚══════╝╚═╝░░╚══╝░╚════╝░░╚════╝░╚═════╝░╚══════╝╚═╝░░╚═╝
|
|
27
|
+
The encoder class is used to translate input characters and sentences into one-hot encoded vectors.
|
|
28
|
+
Additionally, the encoder class contains functions to handle file reading, regex filtering, and conversions
|
|
29
|
+
between 'Matrix' and 'NArray' types.
|
|
30
|
+
=end
|
|
31
|
+
class ENCODER
|
|
32
|
+
# Sets up the character table used for one-hot encoding.
#
# charmatrix - optional Array of characters. When nil, a default table of
#              56 characters is used: '.', ',', space, '!', then a-z and A-Z.
#
# Sets @Length (table size), @SelectionMatrix (a 1 x @Length zero matrix)
# and @CharMatrix (the table itself). Returns the table.
def init(charmatrix = nil)
  if charmatrix.nil?
    charmatrix = ['.', ',', "\s", '!'] + ('a'..'z').to_a + ('A'..'Z').to_a
  end
  # Derive the width from the table so the two can never disagree.
  @Length = charmatrix.length
  @SelectionMatrix = Matrix.build(1, @Length) { 0 } # 1 row, @Length columns
  @CharMatrix = charmatrix
end
|
|
45
|
+
# Converts a Matrix (anything responding to to_a with row arrays) into a
# DFloat array built from those rows.
def matrixToNArray(matrix)
  rows = matrix.to_a
  DFloat[*rows]
end
|
|
49
|
+
# Converts a two-dimensional array-like object into a Matrix, using the row
# arrays returned by its to_a.
def nArrayToMatrix(nArray)
  # Matrix.rows(rows, false) is what Matrix[*rows] does internally.
  Matrix.rows(nArray.to_a, false)
end
|
|
55
|
+
# Lists the visible plain files directly inside a directory.
#
# path - directory to list.
#
# Returns an Array of entry names (not full paths), in the order reported by
# Dir.entries. Names starting with '.' and subdirectories are excluded.
def listFilesInDirectory(path)
  Dir.entries(path).reject do |entry|
    # Resolve the entry against path; a bare name would be tested against
    # the current working directory instead.
    entry.start_with?('.') || File.directory?(File.join(path, entry))
  end
end
|
|
59
|
+
# Reads every visible plain file directly inside a directory.
#
# path - directory whose files are read.
#
# Names starting with '.' and subdirectories are skipped. A
# "reading file <path>/<name>" line is printed for each file before it is
# read with readPlaintextfile.
#
# Returns an Array holding the contents of each file, in the order reported
# by Dir.entries.
def readPlaintextFilesInDirectory(path)
  fileNameArray = Dir.entries(path).reject do |entry|
    # Resolve the entry against path; a bare name would be tested against
    # the current working directory and subdirectories would slip through.
    entry.start_with?('.') || File.directory?(File.join(path, entry))
  end
  fileNameArray.map do |fileName|
    filePath = path + "/" + fileName
    puts "reading file " + filePath
    readPlaintextfile(filePath)
  end
end
|
|
71
|
+
# One-hot encodes a window of characters taken from one file.
#
# fileDataArray - collection of file contents (strings).
# index         - which entry of fileDataArray to use.
# start         - character offset of the window within the filtered text.
# input_len, predict_len - the window is input_len + predict_len long.
#
# The text is passed through filter before the window is cut. Returns the
# result of hotEncodeSentance for that window.
def readFileDataArray(fileDataArray, index, start, input_len, predict_len)
  cleaned = filter(fileDataArray[index])
  window = cleaned[start, predict_len + input_len]
  hotEncodeSentance(window)
end
|
|
75
|
+
# Reads a whole file and returns its contents as a String.
#
# fileName - path of the file to read.
#
# File.read opens, reads and closes the file in one call, so the handle is
# released even if reading raises.
def readPlaintextfile(fileName)
  File.read(fileName)
end
|
|
81
|
+
# Removes unwanted characters from a string.
#
# sentance - text to clean.
# regex    - pattern whose matches are deleted. When nil, everything except
#            ASCII letters, '.', ',' and the space character is removed.
#
# Returns a new String; the argument is not modified.
def filter(sentance, regex=nil)
  pattern = regex.nil? ? /[^A-Za-z\., ]/ : regex
  sentance.gsub(pattern, '')
end
|
|
89
|
+
# One-hot encodes every character of a string.
#
# sentance - text to encode, one character per row.
#
# Characters for which hotEncodeCharacter returns nil are skipped.
#
# Returns a Matrix with one row per encoded character; an empty input gives
# an empty 0 x 0 Matrix.
def hotEncodeSentance(sentance)
  # Collect all rows first and build the Matrix once, rather than rebuilding
  # it for every character.
  rows = sentance.split(//).map { |letter| hotEncodeCharacter(letter) }.compact.map(&:to_a)
  Matrix.rows(rows)
end
|
|
100
|
+
# Decodes a matrix of one-hot rows back into text.
#
# matrix - Matrix with one encoded character per row.
#
# Returns the String formed by decoding each row with hotDecodeCharacter and
# concatenating the results in row order.
def hotDecodeSentance(matrix)
  (0...matrix.row_count).inject("") do |text, rowIndex|
    text + hotDecodeCharacter(matrix.row(rowIndex))
  end
end
|
|
109
|
+
# One-hot encodes a single character against @CharMatrix.
#
# char - character to encode.
#
# @SelectionMatrix is replaced by a fresh 1 x @Length matrix that holds a
# single 1 at the character's position in @CharMatrix (all zeros when the
# character is not in the table).
#
# Returns that row as a Vector, or nil when the character is unknown so that
# callers such as hotEncodeSentance can skip it.
def hotEncodeCharacter(char)
  index = @CharMatrix.index(char)
  # Build the row in one step instead of mutating a zero matrix; with a nil
  # index no column matches and the row stays all zeros.
  @SelectionMatrix = Matrix.build(1, @Length) { |_row, column| column == index ? 1 : 0 }
  return nil if index.nil?

  @SelectionMatrix.row(0)
end
|
|
116
|
+
# Decodes a one-hot (or score) vector into a character.
#
# vector - anything responding to to_a; the position of its largest element
#          selects the character from @CharMatrix. Ties resolve to the
#          highest index, because [value, index] pairs are compared.
#
# Returns the selected character, or '@' when the vector is empty or the
# position has no entry in @CharMatrix.
def hotDecodeCharacter(vector)
  # max is nil for an empty vector, which leaves index nil as well.
  _value, index = vector.to_a.each_with_index.max
  return '@' if index.nil?

  @CharMatrix[index] || '@'
end
|
|
126
|
+
# Measures how different two strings are, position by position.
#
# a, b - strings to compare.
#
# Characters are compared at equal positions; positions where the strings do
# not hold the same character, including those beyond the end of the shorter
# string, count as differences.
#
# Returns a Float between 0.0 (identical) and 1.0 (no matching position):
# the number of differing positions divided by the longer length. Two empty
# strings give 0.0.
def stringDifferencePercent(a, b)
  longer = [a.size, b.size].max
  return 0.0 if longer.zero?

  same = a.each_char.zip(b.each_char).count { |left, right| left == right }
  # Normalise by the longer length so the result stays within 0.0..1.0
  # whichever argument is shorter.
  (longer - same) / longer.to_f
end
|
|
132
|
+
end
|
metadata
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: one-hot-dictionary
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Ryan May
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2023-06-29 00:00:00.000000000 Z
|
|
12
|
+
dependencies: []
|
|
13
|
+
description:
|
|
14
|
+
email: 19477774@student.curtin.edu.au
|
|
15
|
+
executables: []
|
|
16
|
+
extensions: []
|
|
17
|
+
extra_rdoc_files: []
|
|
18
|
+
files:
|
|
19
|
+
- lib/DICTIONARY.rb
|
|
20
|
+
- lib/ENCODER.rb
|
|
21
|
+
homepage: https://github.com/ryan-n-may/Ruby_One_Hot_Dictionary
|
|
22
|
+
licenses:
|
|
23
|
+
- MIT
|
|
24
|
+
metadata: {}
|
|
25
|
+
post_install_message:
|
|
26
|
+
rdoc_options: []
|
|
27
|
+
require_paths:
|
|
28
|
+
- lib
|
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - ">="
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: '0'
|
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
35
|
+
requirements:
|
|
36
|
+
- - ">="
|
|
37
|
+
- !ruby/object:Gem::Version
|
|
38
|
+
version: '0'
|
|
39
|
+
requirements: []
|
|
40
|
+
rubygems_version: 3.3.15
|
|
41
|
+
signing_key:
|
|
42
|
+
specification_version: 4
|
|
43
|
+
summary: A dictionary gem that one-hot encodes strings.
|
|
44
|
+
test_files: []
|