github-linguist 7.30.0 → 8.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/grammars/inline.edgeql.json +1 -1
- data/grammars/inline.peggy.json +1 -0
- data/grammars/markdown.move.codeblock.json +1 -1
- data/grammars/mdx.move.codeblock.json +1 -0
- data/grammars/source.abl.json +1 -1
- data/grammars/source.bicep.json +1 -1
- data/grammars/source.cairo.json +1 -1
- data/grammars/source.cairo0.json +1 -0
- data/grammars/source.cl.json +1 -1
- data/grammars/source.clar.json +1 -1
- data/grammars/source.clarion.json +1 -1
- data/grammars/source.cmd.json +1 -1
- data/grammars/source.commonlisp.json +1 -1
- data/grammars/source.cs.json +1 -1
- data/grammars/source.curlrc.json +1 -1
- data/grammars/source.curry.json +1 -1
- data/grammars/source.cylc.json +1 -0
- data/grammars/source.dart.json +1 -1
- data/grammars/source.dds.dspf.json +1 -1
- data/grammars/source.dds.icff.json +1 -1
- data/grammars/source.dds.lf.json +1 -1
- data/grammars/source.dds.pf.json +1 -1
- data/grammars/source.dds.prtf.json +1 -1
- data/grammars/source.dune.json +1 -0
- data/grammars/source.elvish.json +1 -1
- data/grammars/source.fsharp.json +1 -1
- data/grammars/source.gdscript.json +1 -1
- data/grammars/source.gitconfig.json +1 -1
- data/grammars/source.gleam.json +1 -1
- data/grammars/source.hgignore.json +1 -1
- data/grammars/source.hosts.json +1 -1
- data/grammars/source.iCalendar.json +1 -0
- data/grammars/source.ice.json +1 -1
- data/grammars/source.julia.json +1 -1
- data/grammars/source.just.json +1 -1
- data/grammars/source.lcb.json +1 -0
- data/grammars/source.lilypond.json +1 -1
- data/grammars/source.livecodescript.json +1 -0
- data/grammars/source.luau.json +1 -1
- data/grammars/source.matlab.json +1 -1
- data/grammars/source.mcfunction.json +1 -1
- data/grammars/source.mdx.json +1 -1
- data/grammars/source.mo.json +1 -1
- data/grammars/source.move.json +1 -1
- data/grammars/source.nanorc.json +1 -1
- data/grammars/source.nr.json +1 -0
- data/grammars/source.p4.json +1 -1
- data/grammars/source.peggy.json +1 -0
- data/grammars/source.polar.json +1 -1
- data/grammars/source.powerbuilder.json +1 -0
- data/grammars/source.qsharp.json +1 -1
- data/grammars/source.rpgle.json +1 -1
- data/grammars/source.rust.json +1 -1
- data/grammars/source.sentinel.json +1 -1
- data/grammars/source.sourcepawn.json +1 -1
- data/grammars/source.stan.json +1 -1
- data/grammars/source.swift.json +1 -1
- data/grammars/source.sy.json +1 -1
- data/grammars/source.vba.json +1 -1
- data/grammars/source.vcard.json +1 -0
- data/grammars/source.wdl.json +1 -1
- data/grammars/text.adblock.json +1 -1
- data/grammars/text.html.jte.json +1 -0
- data/grammars/text.html.statamic.json +1 -1
- data/grammars/text.md.json +1 -1
- data/grammars/text.mdx.astro.codeblock.json +1 -0
- data/grammars/version +1 -1
- data/lib/linguist/VERSION +1 -1
- data/lib/linguist/classifier.rb +315 -106
- data/lib/linguist/generated.rb +17 -4
- data/lib/linguist/generic.yml +1 -0
- data/lib/linguist/heuristics.rb +6 -6
- data/lib/linguist/heuristics.yml +54 -4
- data/lib/linguist/languages.json +1 -1
- data/lib/linguist/languages.yml +123 -7
- data/lib/linguist/samples.json +1 -1
- data/lib/linguist/samples.rb +9 -1
- data/lib/linguist/sha256.rb +1 -1
- metadata +17 -5
- data/grammars/markdown.mcfunction.codeblock.json +0 -1
- data/grammars/mdx.LANGUAGE.codeblock.json +0 -1
@@ -0,0 +1 @@
+{"scopeName":"text.mdx.astro.codeblock","patterns":[{"name":"markup.code.astro.mdx","contentName":"meta.embedded.astro","begin":"(?:^|\\G)[\\t ]*(`{3,})(?:[\\t ]*((?i:(?:.*\\.)?astro))(?:[\\t ]+((?:[^\\n\\r`])+))?)(?:[\\t ]*$)","end":"(\\1)(?:[\\t ]*$)","patterns":[{"include":"#astro-code-block"}],"beginCaptures":{"1":{"name":"string.other.begin.code.fenced.mdx"},"2":{"name":"entity.name.function.mdx"}},"endCaptures":{"1":{"name":"string.other.end.code.fenced.mdx"}}},{"name":"markup.code.astro.mdx","contentName":"meta.embedded.astro","begin":"(?:^|\\G)[\\t ]*(~{3,})(?:[\\t ]*((?i:(?:.*\\.)?astro))(?:[\\t ]+((?:[^\\n\\r])+))?)(?:[\\t ]*$)","end":"(\\1)(?:[\\t ]*$)","patterns":[{"include":"#astro-code-block"}],"beginCaptures":{"1":{"name":"string.other.begin.code.fenced.mdx"},"2":{"name":"entity.name.function.mdx"}},"endCaptures":{"1":{"name":"string.other.end.code.fenced.mdx"}}}],"repository":{"astro-code-block":{"patterns":[{"contentName":"meta.embedded.block.astro.frontmatter","begin":"^\\s*---\\s*$","end":"^\\s*---\\s*$","patterns":[{"include":"source.tsx"}],"beginCaptures":{"0":{"name":"punctuation.definition.tag.xi.begin.t"}},"endCaptures":{"0":{"name":"punctuation.definition.tag.xi.end.t"}}},{"include":"source.astro"}]}}}
data/grammars/version
CHANGED
@@ -1 +1 @@
-
+8.0.0
data/lib/linguist/VERSION
CHANGED
@@ -1 +1 @@
-7.30.0
+8.0.0
data/lib/linguist/classifier.rb
CHANGED
@@ -1,8 +1,12 @@
 require 'linguist/tokenizer'
+require 'set'
 
 module Linguist
-  # Language
+  # Language content classifier.
   class Classifier
+    # Maximum number of bytes to consider for classification.
+    # This is only used at evaluation time. During training, full content of
+    # samples is used.
     CLASSIFIER_CONSIDER_BYTES = 50 * 1024
 
     # Public: Use the classifier to detect language of the blob.
@@ -28,41 +32,59 @@ module Linguist
     #
     # db - Hash classifier database object
     # language - String language of data
-    # data - String contents of file
+    # data - String contents of file or array of tokens.
     #
     # Examples
     #
-    #   Classifier.train(db, 'Ruby', "def hello; end")
+    #   Classifier.train!(db, 'Ruby', "def hello; end")
     #
-    # Returns
+    # Returns nil.
     #
-    # Set LINGUIST_DEBUG=1 or =
-    # per-language. See also #dump_all_tokens, below.
+    # Set LINGUIST_DEBUG=1, =2 or =3 to print internal statistics.
     def self.train!(db, language, data)
       tokens = data
       tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
 
-
-
+      db['vocabulary'] ||= {}
+      # Set hash to autoincremented index value
+      if db['vocabulary'].default_proc.nil?
+        db['vocabulary'].default_proc = proc do |hash, key|
+          hash[key] = hash.length
+        end
+      end
 
-      db['
-      db['
-      db['tokens'] ||= {}
-      db['language_tokens'] ||= {}
-      db['languages'] ||= {}
+      db['samples'] ||= {}
+      db['samples'][language] ||= []
 
-
-
-
-
-
-
-
-
-
-
-
+      termfreq = to_vocabulary_index_termfreq(db['vocabulary'], tokens)
+      db['samples'][language] << termfreq
+
+      nil
+    end
+
+    # Public: Finalize training.
+    #
+    # db - Hash classifier database object
+    #
+    # Examples:
+    #   Classifier.finalize_train!(db)
+    #
+    # Returns nil.
+    #
+    # This method must be called after the last #train! call.
+    def self.finalize_train!(db)
+      db['vocabulary'] ||= {}
+
+      # Unset hash autoincrement
+      db['vocabulary'].default_proc = nil
 
+      db['samples'] ||= []
+      filter_vocab_by_freq! db, MIN_DOCUMENT_FREQUENCY
+      sort_vocab! db
+      db['icf'] = inverse_class_freqs db
+      normalize_samples! db
+      db['centroids'] = get_centroids db
+      db.delete 'samples'
       nil
     end
 
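The `train!` hunk above builds the shared vocabulary with a Hash whose `default_proc` hands out auto-incrementing indices, so every previously unseen token gets the next free integer id, and `finalize_train!` later clears the proc so lookups stop mutating the hash. A minimal standalone sketch of that idiom (the `vocab`/`termfreq` names are illustrative, not part of the gem):

```ruby
# Auto-incrementing vocabulary: an unknown term receives the next free index
# the first time it is looked up.
vocab = {}
vocab.default_proc = proc { |hash, key| hash[key] = hash.length }

tokens = %w[def hello end def]
termfreq = Hash.new(0)
tokens.each { |tok| termfreq[vocab[tok]] += 1 }

vocab     # => {"def"=>0, "hello"=>1, "end"=>2}
termfreq  # => {0=>2, 1=>1, 2=>1}

# Freezing the mapping, as finalize_train! does:
vocab.default_proc = nil
```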
@@ -78,20 +100,17 @@ module Linguist
     #   # => [ 'Ruby', 0.90], ['Python', 0.2], ... ]
     #
     # Returns sorted Array of result pairs. Each pair contains the
-    # String language name and a Float score.
+    # String language name and a Float score between 0.0 and 1.0.
     def self.classify(db, tokens, languages = nil)
-      languages ||= db['
+      languages ||= db['centroids'].keys
       new(db).classify(tokens, languages)
     end
 
     # Internal: Initialize a Classifier.
     def initialize(db = {})
-      @
-      @
-      @
-      @language_tokens = db['language_tokens']
-      @languages = db['languages']
-      @unknown_logprob = Math.log(1 / db['tokens_total'].to_f)
+      @vocabulary = db['vocabulary']
+      @centroids = db['centroids']
+      @icf = db['icf']
     end
 
     # Internal: Guess language of data
@@ -100,72 +119,70 @@ module Linguist
     # languages - Array of language name Strings to restrict to.
     #
     # Returns sorted Array of result pairs. Each pair contains the
-    # String language name and a Float score.
+    # String language name and a Float score between 0.0 and 1.0.
     def classify(tokens, languages)
       return [] if tokens.nil? || languages.empty?
       tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
-      scores = {}
 
-
+      debug_dump_tokens(tokens) if verbosity >= 3
 
-
-
+      vec = Classifier.to_vocabulary_index_termfreq_gaps(@vocabulary, tokens)
+      vec.each do |idx, freq|
+        tf = 1.0 + Math.log(freq)
+        vec[idx] = tf * @icf[idx]
+      end
+      return [] if vec.empty?
+      Classifier.l2_normalize!(vec)
 
+      scores = {}
       languages.each do |language|
-
-
+        centroid = @centroids[language]
+        score = Classifier.similarity(vec, centroid)
+        if score > 0.0
+          scores[language] = score
+        end
       end
-
-
+      scores = scores.sort_by { |x| -x[1] }
+      debug_dump_all_tokens(tokens, scores) if verbosity >= 2
+      debug_dump_scores(scores) if verbosity >= 1
+      scores
     end
 
-
-
-
-
-
-    # Returns Float between 0.0 and 1.0.
-    def tokens_probability(counts, language)
-      sum = 0
-      counts.each do |token, count|
-        sum += count * token_probability(token, language)
+    private
+    MIN_DOCUMENT_FREQUENCY = 2
+
+    def verbosity
+      @verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
     end
-      sum
-    end
 
-
-
-
-
-    #
-    # Returns Float.
-    def token_probability(token, language)
-      count = @tokens[language][token]
-      if count.nil? || count == 0
-        # This is usually the most common case, so we cache the result.
-        @unknown_logprob
-      else
-        Math.log(count.to_f / @language_tokens[language].to_f)
+    def debug_dump_scores(scores)
+      headers = ["Language", "Score"]
+      rows = scores.map { |l, s| [l, "%.3f" % s] }
+      dump_table(headers, rows)
     end
-    end
 
-
-
-
-
-
-
-
-
+    def debug_dump_tokens(tokens)
+      counts = Hash.new(0)
+      tokens.each do |tok|
+        idx = @vocabulary[tok]
+        if not idx.nil?
+          counts[tok] += 1
+        end
+      end
 
-
-
-
-
+      norm = Classifier.l2_norm(counts)
+      rows = counts.map do |tok, tf|
+        idx = @vocabulary[tok]
+        log_tf = 1.0 + Math.log(tf)
+        with_icf = log_tf * @icf[idx]
+        normalized = with_icf / norm
+        row = [tok, tf, "%.3f" % log_tf, "%.3f" % with_icf, "%.3f" % normalized]
+        [normalized, row]
+      end
 
-
-
-
+      headers = ["Token", "TF", "log", "*ICF", "L2"]
+      rows = rows.sort_by { |x| -x[0] }.map { |_, row| row }
+      dump_table(headers, rows)
     end
 
     # Internal: show a table of probabilities for each <token,language> pair.
@@ -173,31 +190,223 @@ module Linguist
     # The number in each table entry is the number of "points" that each
     # token contributes toward the belief that the file under test is a
     # particular language. Points are additive.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def debug_dump_all_tokens(tokens, scores)
+      languages = scores.map { |l, _| l }
+
+      counts = Hash.new(0)
+      tokens.each do |tok|
+        idx = @vocabulary[tok]
+        if not idx.nil?
+          counts[tok] += 1
+        end
+      end
+
+      data = {}
+      norm = Classifier.l2_norm(counts)
+      languages.each do |language|
+        data[language] = {}
+        counts.each do |tok, tf|
+          idx = @vocabulary[tok]
+          log_tf = 1.0 + Math.log(tf)
+          with_icf = log_tf * @icf[idx]
+          normalized = with_icf / norm
+          data[language][tok] = normalized * @centroids[language][idx].to_f
+        end
+      end
+
+      norm = Classifier.l2_norm(counts)
+      rows = counts.map do |tok, tf|
+        idx = @vocabulary[tok]
+        log_tf = 1.0 + Math.log(tf)
+        with_icf = log_tf * @icf[idx]
+        normalized = with_icf / norm
+        scores = languages.map do |l, _|
+          [l, data[l][tok].to_f]
+        end
+        max_score = scores.to_h.values.max
+        row = [tok] + scores.map do |l, s|
+          if s == max_score
+            "%.4f*" % s
+          elsif s > 0.0
+            "%.4f" % s
+          else
+            "-"
+          end
+        end
+        [normalized, row]
+      end
+      headers = ["Token"] + (0..languages.length-1).map { |lidx| "[#{lidx}]" }
+      rows = rows.sort_by { |x| -x[0] }.map { |_, row| row }
+      legend = languages.each_with_index.map { |l, lidx| "[#{lidx}] = #{l}" }
+      dump_table(headers, rows, legend)
+    end
+
+    def dump_table(header, rows, legend = nil)
+      n_cols = header.length
+      rows = rows.map { |r| r.map { |c| c.to_s } }
+      col_widths = (0..n_cols - 1).map do |j|
+        ([header[j].length] + rows.map { |row| row[j].length }).max
+      end
+      sep_line = "| #{(0..n_cols-1).map { |j| "-" * col_widths[j] }.join(" | ")} |"
+      content_width = sep_line.length - 4
+      top_line = "| #{"-" * content_width} |"
+
+      format_row = Proc.new do |row|
+        cells = row.zip(col_widths).map do |cell, width|
+          "%-#{width}s" % cell
+        end
+        "| %s |" % cells.join(" | ")
+      end
+
+      puts top_line
+      puts format_row.call(header)
+      puts sep_line
+      rows.each do |row|
+        puts format_row.call(row)
+      end
+      puts top_line
+      if legend
+        legend.each do |line|
+          puts "| %-#{content_width}s |" % line
+        end
+        puts top_line
+      end
+    end
+
+    def self.to_vocabulary_index_termfreq(vocab, tokens)
+      counts = Hash.new(0)
+      tokens.each do |key|
+        idx = vocab[key]
+        counts[idx] += 1
+      end
+      counts
+    end
+
+    def self.to_vocabulary_index_termfreq_gaps(vocab, tokens)
+      counts = Hash.new(0)
+      tokens.each do |key|
+        if vocab.key? key
+          idx = vocab[key]
+          counts[idx] += 1
+        end
+      end
+      counts
+    end
+
+    def self.l2_norm(vec)
+      norm = vec.values.inject(0.0) { |sum, x| sum + x**2 }
+      Math.sqrt(norm)
+    end
+
+    def self.l2_normalize!(vec)
+      norm = l2_norm(vec)
+      vec.transform_values! do |value|
+        value.to_f / norm
+      end
+      nil
+    end
+
+    def self.similarity(a, b)
+      sum = 0.0
+      a.each_key do |idx|
+        if b.key? idx
+          sum += a[idx] * b[idx]
         end
-
+      end
+      sum
     end
+
+    # Filter vocabulary by minimum document frequency.
+    def self.filter_vocab_by_freq!(db, min_freq)
+      vocabulary = db['vocabulary']
+
+      # Get document frequencies
+      docfreq = Array.new(vocabulary.size, 0)
+      db['samples'].each_value do |samples|
+        samples.each do |sample|
+          sample.each_key do |idx|
+            docfreq[idx] += 1
+          end
+        end
+      end
+
+      vocabulary.select! do |_, idx|
+        docfreq[idx] >= min_freq
+      end
+
+      nil
+    end
+
+    # Sort vocabulary lexicographically.
+    def self.sort_vocab!(db)
+      new_indices = Hash.new { |h,k| h[k] = h.length }
+      db['vocabulary'].sort_by { |x| x[0] }.each do |term, idx|
+        db['vocabulary'][term] = new_indices[idx]
+      end
+      new_indices.default_proc = nil
+
+      db['samples'].transform_values! do |samples|
+        samples.map do |sample|
+          new_sample = {}
+          sample.each do |idx, freq|
+            new_idx = new_indices[idx]
+            if not new_idx.nil?
+              new_sample[new_idx] = freq
+            end
+          end
+          new_sample
+        end
+      end
+    end
+
+    # Compute inverse class frequency (ICF) for every term.
+    def self.inverse_class_freqs(db)
+      icf = Array.new(db['vocabulary'].size, 0)
+      db['samples'].each_value do |samples|
+        terms = Set.new
+        samples.each do |sample|
+          terms |= sample.keys
+        end
+        terms.each do |idx|
+          icf[idx] += 1
+        end
+      end
+      icf.map! do |val|
+        Math.log(db['samples'].size.to_f / val.to_f) + 1
+      end
+      icf
+    end
+
+    def self.normalize_samples!(db)
+      icf = db['icf']
+      db['samples'].each_value do |samples|
+        samples.each do |sample|
+          sample.each do |idx, freq|
+            tf = 1.0 + Math.log(freq)
+            sample[idx] = tf * icf[idx]
+          end
+          l2_normalize! sample
+        end
+      end
+    end
+
+    def self.get_centroids(db)
+      centroids = {}
+      db['samples'].each do |language, samples|
+        centroid = Hash.new(0.0)
+        samples.each do |sample|
+          sample.each do |idx, val|
+            centroid[idx] += val
+          end
+        end
+        centroid.each_key do |idx|
+          centroid[idx] = centroid[idx] / samples.length
+        end
+        l2_normalize! centroid
+        centroids[language] = centroid
+      end
+      centroids
+    end
+
   end
 end
data/lib/linguist/generated.rb
CHANGED
@@ -61,6 +61,7 @@ module Linguist
       composer_lock? ||
       cargo_lock? ||
       cargo_orig? ||
+      deno_lock? ||
       flake_lock? ||
       bazel_lock? ||
       node_modules? ||
@@ -68,6 +69,7 @@ module Linguist
       go_lock? ||
       poetry_lock? ||
       pdm_lock? ||
+      uv_lock? ||
       esy_lock? ||
       npm_shrinkwrap_or_package_lock? ||
       pnpm_lock? ||
@@ -422,6 +424,13 @@ module Linguist
       !!name.match(/pdm\.lock/)
     end
 
+    # Internal: Is the blob a generated uv.lock?
+    #
+    # Returns true or false.
+    def uv_lock?
+      !!name.match(/uv\.lock/)
+    end
+
     # Internal: Is the blob a generated esy lock file?
     #
     # Returns true or false.
@@ -429,6 +438,13 @@ module Linguist
       !!name.match(/(^|\/)(\w+\.)?esy.lock$/)
     end
 
+    # Internal: Is the blob a generated deno lockfile, which are not meant for humans in pull requests.
+    #
+    # Returns true or false.
+    def deno_lock?
+      !!name.match(/deno\.lock/)
+    end
+
     # Internal: Is the blob a generated npm shrinkwrap or package lock file?
     #
     # Returns true or false.
@@ -697,14 +713,11 @@ module Linguist
 
     # Internal: Is this a generated Game Maker Studio (2) metadata file?
     #
-    # All Game Maker Studio 2 generated files will be JSON, .yy or .yyp, and have
-    # a part that looks like "modelName: GMname" on the 3rd line
-    #
     # Return true or false
     def generated_gamemakerstudio?
       return false unless ['.yy', '.yyp'].include? extname
       return false unless lines.count > 3
-      return lines
+      return lines.first(3).join('').match?(/^\s*[\{\[]/) ||
         lines[0] =~ /^\d\.\d\.\d.+\|\{/
     end
 
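generated.rb gains two lockfile predicates, `uv_lock?` and `deno_lock?`, wired into the `generated?` chain so uv and Deno lockfiles are treated as generated (and therefore excluded from language statistics). Like the existing `pdm_lock?` check, both are simple unanchored filename regexes; a quick standalone check of the patterns added above, using illustrative file names:

```ruby
# Mirror of the new predicates' regexes from the hunk above.
["uv.lock", "backend/uv.lock", "deno.lock", "Cargo.lock"].each do |name|
  uv   = !!name.match(/uv\.lock/)
  deno = !!name.match(/deno\.lock/)
  puts "#{name}: uv_lock?=#{uv} deno_lock?=#{deno}"
end
# uv.lock         => uv_lock?=true  deno_lock?=false
# backend/uv.lock => uv_lock?=true  deno_lock?=false
# deno.lock       => uv_lock?=false deno_lock?=true
# Cargo.lock      => uv_lock?=false deno_lock?=false
```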
data/lib/linguist/generic.yml
CHANGED
data/lib/linguist/heuristics.rb
CHANGED
@@ -126,7 +126,7 @@ module Linguist
     # Internal: Perform the heuristic
     def call(data)
       matched = @rules.find do |rule|
-        rule['pattern'].match(data)
+        rule['pattern'].match?(data)
       end
       if !matched.nil?
         languages = matched['language']
@@ -145,14 +145,14 @@ module Linguist
       @pats = pats
     end
 
-    def match(input)
-      return
+    def match?(input)
+      return @pats.all? { |pat| pat.match?(input) }
     end
 
   end
 
   class AlwaysMatch
-    def match(input)
+    def match?(input)
       return true
     end
   end
@@ -163,8 +163,8 @@ module Linguist
       @pat = pat
     end
 
-    def match(input)
-      return !@pat.match(input)
+    def match?(input)
+      return !@pat.match?(input)
     end
 
   end
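The heuristic rule objects now expose `match?` instead of `match`, following Ruby's `Regexp#match?` convention: a boolean-only test that neither allocates a MatchData object nor sets `$~`, which is all the heuristics need when probing a blob against many patterns. A small illustration with a made-up pattern (not one of linguist's heuristics):

```ruby
pattern = /^\s*interface\s/

pattern.match("interface Foo {")   # => #<MatchData "interface ">  (allocates, sets $~)
pattern.match?("interface Foo {")  # => true   (no MatchData, $~ untouched)
pattern.match?("class Foo {")      # => false
```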