wordgraph 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8a0039b72e4ecc4059c72c226860382a4c76ce898b170b8db2c0e31c9ae854dc
4
- data.tar.gz: 05a984e25ff21ddf8620dda415138e8763aeb26f60344eafd9cd9a6d7622a889
3
+ metadata.gz: 45813d9103ca02b5238dbfe12b4aa656da441dcbe66be0fac9e8e46f81b9744a
4
+ data.tar.gz: 21516d98f4aea705561d34c8a23835dc23df1d575369a0bc4a7cd225a863a7b3
5
5
  SHA512:
6
- metadata.gz: bb4bba4db23d7bf136e4d2868923a1b7e6ffd3e8e3e80b1e8cccd3cb413b77df3ff671242592774b862b8daa925641a3325fdfddf345b6c66934372b3f8fa6f8
7
- data.tar.gz: 6f61da0501b80dcd2c799f21da041012533301f1a7f9ac9afd3cf5f96250fba29a153fc72a08b7ffbc393f0a3d93f2b728ad6afe70d9ba55a49689fc358effe9
6
+ metadata.gz: 2170ee77edca8125943b3dac61aee6d0fed2a753b7d3456a7acadc5dd4cd61711663b8fc7c833f329eac5d3db8e9d1f83b2973f3e62bb8a29ad5a011e30df29b
7
+ data.tar.gz: b69f1a188c603e887e145720fd123db9397cb4bca03b6dbb2104b947c5ef351b175f6efd270cf8a7658b22fce68ace1f41194a5069f8079000269e529a23bdf9
data/lib/wordgraph/cli.rb CHANGED
@@ -15,19 +15,46 @@ module Wordgraph
15
15
  opts.separator ""
16
16
  opts.separator "Specific options:"
17
17
 
18
+ opts.on("-h", "--help", "Prints this help") do
19
+ puts opts
20
+ puts args
21
+ end
22
+
18
23
  opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
19
24
  @options[:verbose] = v
20
25
  end
21
26
 
22
- opts.on("-h", "--help", "Prints this help") do
23
- puts opts
24
- puts args
27
+ opts.on("-o=STRING", "--output-directory=STRING", "Directory to output files", String) do |dir|
28
+ # Supports /mnt/[:drive_letter]/[:directory]
29
+ @options[:output_directory] = dir
30
+ end
31
+
32
+ opts.on("-n=STRING", "--name=STRING", "Output file name", String) do |name|
33
+ @options[:name] = name
34
+ end
35
+
36
+ opts.on("--[no-]overwrite", "Overwrite output file") do |overwrite|
37
+ @options[:overwrite] = overwrite
38
+ end
39
+
40
+ opts.on("-s=INTEGER", "--seed=INTEGER", "Deterministic word order", Integer) do |seed|
41
+ @options[:seed] = seed
42
+ end
43
+
44
+ opts.on("-m=INTEGER", "--nlargest=INTEGER", "Maximum amount of words in graph", Integer) do |nlargest|
45
+ @options[:nlargest] = nlargest
46
+ end
47
+
48
+ opts.on("--font=STRING", ["times", "georgia", "garamond", "arial",
49
+ "helvetica", "verdana", "courier", "cursive", "papyrus"],
50
+ "Use a web safe font", String) do |font|
51
+ @options[:font] = font
25
52
  end
26
53
  end
27
54
 
28
55
  begin
29
56
  files = parser.parse!(args)
30
- core = Core.new(files, @options[:verbose])
57
+ core = Core.new(files, **@options)
31
58
  core.process
32
59
  rescue OptionParser::InvalidOption => e
33
60
  puts e.message
@@ -1,28 +1,178 @@
1
+ require_relative "mathwg"
2
+
1
3
  module Wordgraph
2
4
  class Core
3
- def initialize(files, verbose=false)
5
+ def initialize(files, verbose: false, output_directory: Dir.pwd, name: "wordgraph",
6
+ overwrite: false, seed: nil, nlargest: nil, font: "times")
4
7
  @files = files
5
8
  @verbose = verbose
9
+ @output_directory = output_directory
10
+ @name = name
11
+ @overwrite = overwrite
12
+ @seed = seed
13
+ @nlargest = nlargest
14
+ @max_size = 20
15
+ @min_font_size = 12;
16
+ @max_font_size = 48;
17
+ @font = font.downcase
18
+ if @verbose
19
+ puts "Proceeding with settings:"
20
+ self.instance_variables.each do |var|
21
+ puts "#{var}: #{self.instance_variable_get(var) || false}"
22
+ end
23
+ end
6
24
  end
7
25
 
8
- def process
9
- puts "Processing #{@files}" if @verbose
26
+ def tokenize(word)
27
+ # Lowercase
28
+ word = word.downcase
29
+ # Strip leading punctuation at word start
30
+ word = word.sub(/^[\.,;:!?'"`(\[\{<]+/, "")
31
+ # Strip trailing punctuation at word end
32
+ word = word.sub(/[\.,;:!?'"`)\]\}>]+$/, "")
33
+ return word
34
+ end
35
+
36
+ def generate_cloud(tokens)
37
+ # https://en.wikipedia.org/wiki/Tag_cloud
38
+ raise ArgumentError, "Empty tokens map" unless tokens.length > 0
39
+ # Linear normalization
40
+ # TODO: logarithmic function for larger texts
41
+ min_count = tokens.values.min
42
+ max_count = tokens.values.max
43
+ max_sub_min = [max_count - min_count, 1].max
44
+ tokens = tokens.max_by(@nlargest) { |v| v }.to_h if @nlargest
45
+ tokens.each do |token, count|
46
+ size = count <= min_count ?
47
+ 1 :
48
+ ((@max_size * (count - min_count)) / max_sub_min).ceil
49
+ tokens[token] = {
50
+ count: count,
51
+ size: size
52
+ }
53
+ end
54
+ self.write_html(tokens)
55
+ end
10
56
 
11
- @files.each do |f|
57
+ def get_path
58
+ raise ArgumentError, "Output directory: #{@output_directory} not found" if !Dir.exist?(@output_directory)
59
+ file_name = File.basename(@name) + ".html"
60
+ out = File.join(@output_directory, file_name)
61
+ puts "Creating file: #{out}"
62
+ stop_exists = File.exist?(out) && !@overwrite
63
+ raise ArgumentError, "Output file already exists, either overwrite or change the name" if stop_exists
64
+ return out
65
+ end
66
+
67
+ def remap(from_min, from_max, to_min, to_max, value)
68
+ def lerp(a, b, t)
69
+ return (1 - t) * a + b * t
70
+ end
71
+ def invLerp(a, b, v)
72
+ return a === b ? 0 : (v - a).to_f / (b - a)
73
+ end
74
+ return lerp(to_min, to_max, invLerp(from_min, from_max, value))
75
+ end
76
+
77
+ def write_html(tokens)
78
+ out = self.get_path
79
+ File.open(out, File::RDWR | File::CREAT) do |f|
80
+ f.flock(File::LOCK_EX)
81
+ f.truncate(0)
82
+ document_setup = <<~HTML
83
+ <!DOCTYPE html>
84
+ <html lang="en">
85
+ <head>
86
+ <meta charset="UTF-8">
87
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
88
+ <title>wordgraph</title>
89
+ </head>
90
+ <body>
91
+ #{tokens.to_a.shuffle(random: Random.new(*@seed)).map { |k, v|
92
+ fs = remap(1, @max_size, @min_font_size, @max_font_size, v[:size]).floor
93
+ title = "#{v[:count].to_s} occurrence #{(v[:count] > 1 ) ? "s" : ""}"
94
+ <<~HTML.strip
95
+ <span
96
+ title='#{title}'
97
+ style='font-size: #{fs}px;'
98
+ aria-role='listitem'
99
+ aria-label='#{title}'>#{k}</span>
100
+ HTML
101
+ }.join("\n\s\s")}
102
+ </body>
103
+ <style>
104
+ body {
105
+ display: flex;
106
+ flex-wrap: wrap;
107
+ gap: 8px;
108
+ justify-content: center;
109
+ background-color: #000;
110
+ color: #FFFFFF;
111
+ line-height: 1.6;
112
+ margin: 0;
113
+ padding: 20px;
114
+ font-family: #{@font};
115
+ }
116
+ span {
117
+ display: inline-block;
118
+ margin: 2px 6px;
119
+ padding: 4px 12px;
120
+ border-radius: 4px;
121
+ }
122
+ </style>
123
+ </html>
124
+ HTML
125
+ f.write(document_setup)
126
+ end
127
+ puts File.read(out) if @verbose
128
+ end
129
+
130
+ def process_lines(lines)
131
+ count = {}
132
+ count.default = 0
133
+ lines.each do |line|
134
+ line.split do |word|
135
+ tokenized = self.tokenize(word)
136
+ count[tokenized] += 1
137
+ end
138
+ end
139
+ begin
140
+ puts count if @verbose
141
+ self.generate_cloud(count)
142
+ rescue ArgumentError => e
143
+ raise e
144
+ else
145
+ puts "Succesfully finished processing" if @verbose
146
+ end
147
+ return count
148
+ end
149
+
150
+ def process_text(file)
151
+ puts "Processing txt file #{file}" if @verbose
152
+ lines = IO.readlines(file)
153
+ return self.process_lines(lines)
154
+ end
155
+
156
+ def process_docx(file)
157
+ require "zip"
158
+ puts "Processing docx file #{file}" if @verbose
159
+ binary = File.open(file, 'rb') { |f| f.read }
160
+ Zip::File.open_buffer(binary) do |zip|
161
+ doc = zip.find { |entry| entry.name == 'word/document.xml'}
162
+ text = doc.get_input_stream.read
163
+ # TODO: check if styling is supported
164
+ lines = text.scan(/(?<=<w:t>).+?(?=<\/w:t>)/)
165
+ return self.process_lines(lines)
166
+ end
167
+ end
168
+
169
+ def process
170
+ Array(@files).each do |f|
12
171
  case f
13
172
  when /\.(txt|text)\z/i
14
- puts "Processing txt file #{f}"
15
- count = {}
16
- count.default = 0
17
- IO.foreach(f) do |line|
18
- line.split do |word|
19
- count[word] += 1
20
- end
21
- end
22
- puts count
23
- puts "Succesfully finished processing"
173
+ return self.process_text(f)
24
174
  when /\.docx\z/i
25
- raise ArgumentError, "docx not supported yet."
175
+ return self.process_docx(f)
26
176
  else
27
177
  raise ArgumentError, "File type not supported."
28
178
  end
@@ -0,0 +1,15 @@
1
+ module Wordgraph
2
+ module Mathwg
3
+ def lerp(a, b, t)
4
+ (1 - t) * a + b * t
5
+ end
6
+
7
+ def invLerp(a, b, v)
8
+ a === b ? 0 : (v - a).to_f / (b - a)
9
+ end
10
+
11
+ def remap(from_min, from_max, to_min, to_max, value)
12
+ lerp(to_min, to_max, invLerp(from_min, from_max, value))
13
+ end
14
+ end
15
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wordgraph
4
- VERSION = "0.5.0"
4
+ VERSION = "0.6.0"
5
5
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wordgraph
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - marm00
8
8
  bindir: bin
9
9
  cert_chain: []
10
- date: 2025-01-19 00:00:00.000000000 Z
10
+ date: 2025-01-24 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: optparse
@@ -39,6 +39,7 @@ files:
39
39
  - lib/wordgraph.rb
40
40
  - lib/wordgraph/cli.rb
41
41
  - lib/wordgraph/core.rb
42
+ - lib/wordgraph/mathwg.rb
42
43
  - lib/wordgraph/version.rb
43
44
  - wordgraph.gemspec
44
45
  homepage: https://github.com/marm00/wordgraph