github-linguist 7.30.0 → 8.0.0
- checksums.yaml +4 -4
- data/grammars/inline.edgeql.json +1 -1
- data/grammars/inline.peggy.json +1 -0
- data/grammars/markdown.move.codeblock.json +1 -1
- data/grammars/mdx.move.codeblock.json +1 -0
- data/grammars/source.abl.json +1 -1
- data/grammars/source.bicep.json +1 -1
- data/grammars/source.cairo.json +1 -1
- data/grammars/source.cairo0.json +1 -0
- data/grammars/source.cl.json +1 -1
- data/grammars/source.clar.json +1 -1
- data/grammars/source.clarion.json +1 -1
- data/grammars/source.cmd.json +1 -1
- data/grammars/source.commonlisp.json +1 -1
- data/grammars/source.cs.json +1 -1
- data/grammars/source.curlrc.json +1 -1
- data/grammars/source.curry.json +1 -1
- data/grammars/source.cylc.json +1 -0
- data/grammars/source.dart.json +1 -1
- data/grammars/source.dds.dspf.json +1 -1
- data/grammars/source.dds.icff.json +1 -1
- data/grammars/source.dds.lf.json +1 -1
- data/grammars/source.dds.pf.json +1 -1
- data/grammars/source.dds.prtf.json +1 -1
- data/grammars/source.dune.json +1 -0
- data/grammars/source.elvish.json +1 -1
- data/grammars/source.fsharp.json +1 -1
- data/grammars/source.gdscript.json +1 -1
- data/grammars/source.gitconfig.json +1 -1
- data/grammars/source.gleam.json +1 -1
- data/grammars/source.hgignore.json +1 -1
- data/grammars/source.hosts.json +1 -1
- data/grammars/source.iCalendar.json +1 -0
- data/grammars/source.ice.json +1 -1
- data/grammars/source.julia.json +1 -1
- data/grammars/source.just.json +1 -1
- data/grammars/source.lcb.json +1 -0
- data/grammars/source.lilypond.json +1 -1
- data/grammars/source.livecodescript.json +1 -0
- data/grammars/source.luau.json +1 -1
- data/grammars/source.matlab.json +1 -1
- data/grammars/source.mcfunction.json +1 -1
- data/grammars/source.mdx.json +1 -1
- data/grammars/source.mo.json +1 -1
- data/grammars/source.move.json +1 -1
- data/grammars/source.nanorc.json +1 -1
- data/grammars/source.nr.json +1 -0
- data/grammars/source.p4.json +1 -1
- data/grammars/source.peggy.json +1 -0
- data/grammars/source.polar.json +1 -1
- data/grammars/source.powerbuilder.json +1 -0
- data/grammars/source.qsharp.json +1 -1
- data/grammars/source.rpgle.json +1 -1
- data/grammars/source.rust.json +1 -1
- data/grammars/source.sentinel.json +1 -1
- data/grammars/source.sourcepawn.json +1 -1
- data/grammars/source.stan.json +1 -1
- data/grammars/source.swift.json +1 -1
- data/grammars/source.sy.json +1 -1
- data/grammars/source.vba.json +1 -1
- data/grammars/source.vcard.json +1 -0
- data/grammars/source.wdl.json +1 -1
- data/grammars/text.adblock.json +1 -1
- data/grammars/text.html.jte.json +1 -0
- data/grammars/text.html.statamic.json +1 -1
- data/grammars/text.md.json +1 -1
- data/grammars/text.mdx.astro.codeblock.json +1 -0
- data/grammars/version +1 -1
- data/lib/linguist/VERSION +1 -1
- data/lib/linguist/classifier.rb +315 -106
- data/lib/linguist/generated.rb +17 -4
- data/lib/linguist/generic.yml +1 -0
- data/lib/linguist/heuristics.rb +6 -6
- data/lib/linguist/heuristics.yml +54 -4
- data/lib/linguist/languages.json +1 -1
- data/lib/linguist/languages.yml +123 -7
- data/lib/linguist/samples.json +1 -1
- data/lib/linguist/samples.rb +9 -1
- data/lib/linguist/sha256.rb +1 -1
- metadata +17 -5
- data/grammars/markdown.mcfunction.codeblock.json +0 -1
- data/grammars/mdx.LANGUAGE.codeblock.json +0 -1
data/grammars/text.mdx.astro.codeblock.json
ADDED
@@ -0,0 +1 @@
+{"scopeName":"text.mdx.astro.codeblock","patterns":[{"name":"markup.code.astro.mdx","contentName":"meta.embedded.astro","begin":"(?:^|\\G)[\\t ]*(`{3,})(?:[\\t ]*((?i:(?:.*\\.)?astro))(?:[\\t ]+((?:[^\\n\\r`])+))?)(?:[\\t ]*$)","end":"(\\1)(?:[\\t ]*$)","patterns":[{"include":"#astro-code-block"}],"beginCaptures":{"1":{"name":"string.other.begin.code.fenced.mdx"},"2":{"name":"entity.name.function.mdx"}},"endCaptures":{"1":{"name":"string.other.end.code.fenced.mdx"}}},{"name":"markup.code.astro.mdx","contentName":"meta.embedded.astro","begin":"(?:^|\\G)[\\t ]*(~{3,})(?:[\\t ]*((?i:(?:.*\\.)?astro))(?:[\\t ]+((?:[^\\n\\r])+))?)(?:[\\t ]*$)","end":"(\\1)(?:[\\t ]*$)","patterns":[{"include":"#astro-code-block"}],"beginCaptures":{"1":{"name":"string.other.begin.code.fenced.mdx"},"2":{"name":"entity.name.function.mdx"}},"endCaptures":{"1":{"name":"string.other.end.code.fenced.mdx"}}}],"repository":{"astro-code-block":{"patterns":[{"contentName":"meta.embedded.block.astro.frontmatter","begin":"^\\s*---\\s*$","end":"^\\s*---\\s*$","patterns":[{"include":"source.tsx"}],"beginCaptures":{"0":{"name":"punctuation.definition.tag.xi.begin.t"}},"endCaptures":{"0":{"name":"punctuation.definition.tag.xi.end.t"}}},{"include":"source.astro"}]}}}
data/grammars/version
CHANGED
@@ -1 +1 @@
-
+8.0.0
data/lib/linguist/VERSION
CHANGED
@@ -1 +1 @@
-7.30.0
+8.0.0
data/lib/linguist/classifier.rb
CHANGED
@@ -1,8 +1,12 @@
 require 'linguist/tokenizer'
+require 'set'
 
 module Linguist
-  # Language
+  # Language content classifier.
   class Classifier
+    # Maximum number of bytes to consider for classification.
+    # This is only used at evaluation time. During training, full content of
+    # samples is used.
     CLASSIFIER_CONSIDER_BYTES = 50 * 1024
 
     # Public: Use the classifier to detect language of the blob.
@@ -28,41 +32,59 @@ module Linguist
     #
     # db - Hash classifier database object
     # language - String language of data
-    # data - String contents of file
+    # data - String contents of file or array of tokens.
     #
     # Examples
     #
-    #   Classifier.train(db, 'Ruby', "def hello; end")
+    #   Classifier.train!(db, 'Ruby', "def hello; end")
     #
-    # Returns
+    # Returns nil.
     #
-    # Set LINGUIST_DEBUG=1 or =
-    # per-language. See also #dump_all_tokens, below.
+    # Set LINGUIST_DEBUG=1, =2 or =3 to print internal statistics.
     def self.train!(db, language, data)
       tokens = data
       tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
 
-
-
+      db['vocabulary'] ||= {}
+      # Set hash to autoincremented index value
+      if db['vocabulary'].default_proc.nil?
+        db['vocabulary'].default_proc = proc do |hash, key|
+          hash[key] = hash.length
+        end
+      end
 
-      db['
-      db['
-      db['tokens'] ||= {}
-      db['language_tokens'] ||= {}
-      db['languages'] ||= {}
+      db['samples'] ||= {}
+      db['samples'][language] ||= []
 
-
-
-
-
-
-
-
-
-
-
-
+      termfreq = to_vocabulary_index_termfreq(db['vocabulary'], tokens)
+      db['samples'][language] << termfreq
+
+      nil
+    end
+
+    # Public: Finalize training.
+    #
+    # db - Hash classifier database object
+    #
+    # Examples:
+    #   Classifier.finalize_train!(db)
+    #
+    # Returns nil.
+    #
+    # This method must be called after the last #train! call.
+    def self.finalize_train!(db)
+      db['vocabulary'] ||= {}
+
+      # Unset hash autoincrement
+      db['vocabulary'].default_proc = nil
 
+      db['samples'] ||= []
+      filter_vocab_by_freq! db, MIN_DOCUMENT_FREQUENCY
+      sort_vocab! db
+      db['icf'] = inverse_class_freqs db
+      normalize_samples! db
+      db['centroids'] = get_centroids db
+      db.delete 'samples'
       nil
     end
 
@@ -78,20 +100,17 @@ module Linguist
     #   # => [ 'Ruby', 0.90], ['Python', 0.2], ... ]
     #
     # Returns sorted Array of result pairs. Each pair contains the
-    # String language name and a Float score.
+    # String language name and a Float score between 0.0 and 1.0.
     def self.classify(db, tokens, languages = nil)
-      languages ||= db['
+      languages ||= db['centroids'].keys
       new(db).classify(tokens, languages)
     end
 
     # Internal: Initialize a Classifier.
     def initialize(db = {})
-      @
-      @
-      @
-      @language_tokens = db['language_tokens']
-      @languages = db['languages']
-      @unknown_logprob = Math.log(1 / db['tokens_total'].to_f)
+      @vocabulary = db['vocabulary']
+      @centroids = db['centroids']
+      @icf = db['icf']
     end
 
     # Internal: Guess language of data
@@ -100,72 +119,70 @@ module Linguist
     # languages - Array of language name Strings to restrict to.
     #
     # Returns sorted Array of result pairs. Each pair contains the
-    # String language name and a Float score.
+    # String language name and a Float score between 0.0 and 1.0.
     def classify(tokens, languages)
       return [] if tokens.nil? || languages.empty?
       tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
-      scores = {}
 
-
+      debug_dump_tokens(tokens) if verbosity >= 3
 
-
-
+      vec = Classifier.to_vocabulary_index_termfreq_gaps(@vocabulary, tokens)
+      vec.each do |idx, freq|
+        tf = 1.0 + Math.log(freq)
+        vec[idx] = tf * @icf[idx]
+      end
+      return [] if vec.empty?
+      Classifier.l2_normalize!(vec)
 
+      scores = {}
       languages.each do |language|
-
-
+        centroid = @centroids[language]
+        score = Classifier.similarity(vec, centroid)
+        if score > 0.0
+          scores[language] = score
+        end
       end
-
-
+      scores = scores.sort_by { |x| -x[1] }
+      debug_dump_all_tokens(tokens, scores) if verbosity >= 2
+      debug_dump_scores(scores) if verbosity >= 1
+      scores
     end
 
-
-
-
-
-
-    # Returns Float between 0.0 and 1.0.
-    def tokens_probability(counts, language)
-      sum = 0
-      counts.each do |token, count|
-        sum += count * token_probability(token, language)
+    private
+    MIN_DOCUMENT_FREQUENCY = 2
+
+    def verbosity
+      @verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
     end
-      sum
-    end
 
-
-
-
-
-    #
-    # Returns Float.
-    def token_probability(token, language)
-      count = @tokens[language][token]
-      if count.nil? || count == 0
-        # This is usually the most common case, so we cache the result.
-        @unknown_logprob
-      else
-        Math.log(count.to_f / @language_tokens[language].to_f)
+    def debug_dump_scores(scores)
+      headers = ["Language", "Score"]
+      rows = scores.map { |l, s| [l, "%.3f" % s] }
+      dump_table(headers, rows)
     end
-    end
 
-
-
-
-
-
-
-
-
+    def debug_dump_tokens(tokens)
+      counts = Hash.new(0)
+      tokens.each do |tok|
+        idx = @vocabulary[tok]
+        if not idx.nil?
+          counts[tok] += 1
+        end
+      end
 
-
-
-
-
+      norm = Classifier.l2_norm(counts)
+      rows = counts.map do |tok, tf|
+        idx = @vocabulary[tok]
+        log_tf = 1.0 + Math.log(tf)
+        with_icf = log_tf * @icf[idx]
+        normalized = with_icf / norm
+        row = [tok, tf, "%.3f" % log_tf, "%.3f" % with_icf, "%.3f" % normalized]
+        [normalized, row]
+      end
 
-
-
-
+      headers = ["Token", "TF", "log", "*ICF", "L2"]
+      rows = rows.sort_by { |x| -x[0] }.map { |_, row| row }
+      dump_table(headers, rows)
     end
 
     # Internal: show a table of probabilities for each <token,language> pair.
@@ -173,31 +190,223 @@ module Linguist
     # The number in each table entry is the number of "points" that each
     # token contributes toward the belief that the file under test is a
     # particular language. Points are additive.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def debug_dump_all_tokens(tokens, scores)
+      languages = scores.map { |l, _| l }
+
+      counts = Hash.new(0)
+      tokens.each do |tok|
+        idx = @vocabulary[tok]
+        if not idx.nil?
+          counts[tok] += 1
+        end
+      end
+
+      data = {}
+      norm = Classifier.l2_norm(counts)
+      languages.each do |language|
+        data[language] = {}
+        counts.each do |tok, tf|
+          idx = @vocabulary[tok]
+          log_tf = 1.0 + Math.log(tf)
+          with_icf = log_tf * @icf[idx]
+          normalized = with_icf / norm
+          data[language][tok] = normalized * @centroids[language][idx].to_f
+        end
+      end
+
+      norm = Classifier.l2_norm(counts)
+      rows = counts.map do |tok, tf|
+        idx = @vocabulary[tok]
+        log_tf = 1.0 + Math.log(tf)
+        with_icf = log_tf * @icf[idx]
+        normalized = with_icf / norm
+        scores = languages.map do |l, _|
+          [l, data[l][tok].to_f]
+        end
+        max_score = scores.to_h.values.max
+        row = [tok] + scores.map do |l, s|
+          if s == max_score
+            "%.4f*" % s
+          elsif s > 0.0
+            "%.4f" % s
+          else
+            "-"
+          end
+        end
+        [normalized, row]
+      end
+      headers = ["Token"] + (0..languages.length-1).map { |lidx| "[#{lidx}]" }
+      rows = rows.sort_by { |x| -x[0] }.map { |_, row| row }
+      legend = languages.each_with_index.map { |l, lidx| "[#{lidx}] = #{l}" }
+      dump_table(headers, rows, legend)
+    end
+
+    def dump_table(header, rows, legend = nil)
+      n_cols = header.length
+      rows = rows.map { |r| r.map { |c| c.to_s } }
+      col_widths = (0..n_cols - 1).map do |j|
+        ([header[j].length] + rows.map { |row| row[j].length }).max
+      end
+      sep_line = "| #{(0..n_cols-1).map { |j| "-" * col_widths[j] }.join(" | ")} |"
+      content_width = sep_line.length - 4
+      top_line = "| #{"-" * content_width} |"
+
+      format_row = Proc.new do |row|
+        cells = row.zip(col_widths).map do |cell, width|
+          "%-#{width}s" % cell
+        end
+        "| %s |" % cells.join(" | ")
+      end
+
+      puts top_line
+      puts format_row.call(header)
+      puts sep_line
+      rows.each do |row|
+        puts format_row.call(row)
+      end
+      puts top_line
+      if legend
+        legend.each do |line|
+          puts "| %-#{content_width}s |" % line
+        end
+        puts top_line
+      end
+    end
+
+    def self.to_vocabulary_index_termfreq(vocab, tokens)
+      counts = Hash.new(0)
+      tokens.each do |key|
+        idx = vocab[key]
+        counts[idx] += 1
+      end
+      counts
+    end
+
+    def self.to_vocabulary_index_termfreq_gaps(vocab, tokens)
+      counts = Hash.new(0)
+      tokens.each do |key|
+        if vocab.key? key
+          idx = vocab[key]
+          counts[idx] += 1
+        end
+      end
+      counts
+    end
+
+    def self.l2_norm(vec)
+      norm = vec.values.inject(0.0) { |sum, x| sum + x**2 }
+      Math.sqrt(norm)
+    end
+
+    def self.l2_normalize!(vec)
+      norm = l2_norm(vec)
+      vec.transform_values! do |value|
+        value.to_f / norm
+      end
+      nil
+    end
+
+    def self.similarity(a, b)
+      sum = 0.0
+      a.each_key do |idx|
+        if b.key? idx
+          sum += a[idx] * b[idx]
         end
-
+      end
+      sum
     end
+
+    # Filter vocabulary by minimum document frequency.
+    def self.filter_vocab_by_freq!(db, min_freq)
+      vocabulary = db['vocabulary']
+
+      # Get document frequencies
+      docfreq = Array.new(vocabulary.size, 0)
+      db['samples'].each_value do |samples|
+        samples.each do |sample|
+          sample.each_key do |idx|
+            docfreq[idx] += 1
+          end
+        end
+      end
+
+      vocabulary.select! do |_, idx|
+        docfreq[idx] >= min_freq
+      end
+
+      nil
+    end
+
+    # Sort vocabulary lexicographically.
+    def self.sort_vocab!(db)
+      new_indices = Hash.new { |h,k| h[k] = h.length }
+      db['vocabulary'].sort_by { |x| x[0] }.each do |term, idx|
+        db['vocabulary'][term] = new_indices[idx]
+      end
+      new_indices.default_proc = nil
+
+      db['samples'].transform_values! do |samples|
+        samples.map do |sample|
+          new_sample = {}
+          sample.each do |idx, freq|
+            new_idx = new_indices[idx]
+            if not new_idx.nil?
+              new_sample[new_idx] = freq
+            end
+          end
+          new_sample
+        end
+      end
+    end
+
+    # Compute inverse class frequency (ICF) for every term.
+    def self.inverse_class_freqs(db)
+      icf = Array.new(db['vocabulary'].size, 0)
+      db['samples'].each_value do |samples|
+        terms = Set.new
+        samples.each do |sample|
+          terms |= sample.keys
+        end
+        terms.each do |idx|
+          icf[idx] += 1
+        end
+      end
+      icf.map! do |val|
+        Math.log(db['samples'].size.to_f / val.to_f) + 1
+      end
+      icf
+    end
+
+    def self.normalize_samples!(db)
+      icf = db['icf']
+      db['samples'].each_value do |samples|
+        samples.each do |sample|
+          sample.each do |idx, freq|
+            tf = 1.0 + Math.log(freq)
+            sample[idx] = tf * icf[idx]
+          end
+          l2_normalize! sample
+        end
+      end
+    end
+
+    def self.get_centroids(db)
+      centroids = {}
+      db['samples'].each do |language, samples|
+        centroid = Hash.new(0.0)
+        samples.each do |sample|
+          sample.each do |idx, val|
+            centroid[idx] += val
+          end
+        end
+        centroid.each_key do |idx|
+          centroid[idx] = centroid[idx] / samples.length
+        end
+        l2_normalize! centroid
+        centroids[language] = centroid
+      end
+      centroids
+    end
+
   end
 end
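The change above replaces the old token-count model with a vocabulary index, TF-ICF weighting and per-language centroids compared by cosine similarity. As a rough illustration of how the new two-phase API fits together, here is a minimal Ruby sketch using only the methods introduced in this diff; the training snippets and the resulting scores are illustrative, not taken from linguist's own sample set:

require 'linguist'

# Phase 1: accumulate raw term-frequency samples per language.
db = {}
Linguist::Classifier.train!(db, 'Ruby',   "def hello; puts 'hi'; end")
Linguist::Classifier.train!(db, 'Ruby',   "def goodbye; puts 'bye'; end")
Linguist::Classifier.train!(db, 'Python', "def hello():\n    print('hi')")
Linguist::Classifier.train!(db, 'Python', "def goodbye():\n    print('bye')")

# Phase 2: prune rare terms, sort the vocabulary, compute ICF weights,
# normalize the samples and collapse them into per-language centroids.
Linguist::Classifier.finalize_train!(db)

# Classification tokenizes the input, applies the same TF-ICF weighting,
# L2-normalizes the vector and ranks languages by cosine similarity,
# so every score falls between 0.0 and 1.0.
Linguist::Classifier.classify(db, "def greet; puts 'hello'; end")
# => sorted [language, score] pairs, e.g. [["Ruby", ...], ["Python", ...]]

Note that MIN_DOCUMENT_FREQUENCY = 2 means a term must appear in at least two training samples to survive finalize_train!, which is why the sketch trains each language on more than one snippet.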
data/lib/linguist/generated.rb
CHANGED
@@ -61,6 +61,7 @@ module Linguist
       composer_lock? ||
       cargo_lock? ||
       cargo_orig? ||
+      deno_lock? ||
       flake_lock? ||
       bazel_lock? ||
       node_modules? ||
@@ -68,6 +69,7 @@ module Linguist
       go_lock? ||
       poetry_lock? ||
       pdm_lock? ||
+      uv_lock? ||
       esy_lock? ||
       npm_shrinkwrap_or_package_lock? ||
       pnpm_lock? ||
@@ -422,6 +424,13 @@ module Linguist
       !!name.match(/pdm\.lock/)
     end
 
+    # Internal: Is the blob a generated uv.lock?
+    #
+    # Returns true or false.
+    def uv_lock?
+      !!name.match(/uv\.lock/)
+    end
+
     # Internal: Is the blob a generated esy lock file?
     #
     # Returns true or false.
@@ -429,6 +438,13 @@ module Linguist
       !!name.match(/(^|\/)(\w+\.)?esy.lock$/)
     end
 
+    # Internal: Is the blob a generated deno lockfile, which are not meant for humans in pull requests.
+    #
+    # Returns true or false.
+    def deno_lock?
+      !!name.match(/deno\.lock/)
+    end
+
     # Internal: Is the blob a generated npm shrinkwrap or package lock file?
     #
     # Returns true or false.
@@ -697,14 +713,11 @@ module Linguist
 
     # Internal: Is this a generated Game Maker Studio (2) metadata file?
     #
-    # All Game Maker Studio 2 generated files will be JSON, .yy or .yyp, and have
-    # a part that looks like "modelName: GMname" on the 3rd line
-    #
     # Return true or false
     def generated_gamemakerstudio?
       return false unless ['.yy', '.yyp'].include? extname
       return false unless lines.count > 3
-      return lines
+      return lines.first(3).join('').match?(/^\s*[\{\[]/) ||
         lines[0] =~ /^\d\.\d\.\d.+\|\{/
     end
 
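To see the effect of the two new predicates, here is a small sanity-check sketch assuming the Generated.generated?(name, data) class-level helper that the rest of linguist uses; both new checks match on the file name alone, so the contents below are placeholders:

require 'linguist'

# uv.lock and deno.lock are now treated like the other machine-written
# lockfiles (Cargo.lock, poetry.lock, pdm.lock, ...), so they are
# reported as generated and skipped in diffs and language statistics.
Linguist::Generated.generated?("uv.lock", "# machine-written lockfile")  # => true
Linguist::Generated.generated?("deno.lock", "{}")                        # => true
Linguist::Generated.generated?("lib/app.rb", "puts 'hello'")             # => false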
data/lib/linguist/generic.yml
CHANGED
data/lib/linguist/heuristics.rb
CHANGED
@@ -126,7 +126,7 @@ module Linguist
     # Internal: Perform the heuristic
     def call(data)
       matched = @rules.find do |rule|
-        rule['pattern'].match(data)
+        rule['pattern'].match?(data)
       end
       if !matched.nil?
         languages = matched['language']
@@ -145,14 +145,14 @@ module Linguist
       @pats = pats
     end
 
-    def match(input)
-      return
+    def match?(input)
+      return @pats.all? { |pat| pat.match?(input) }
     end
 
   end
 
   class AlwaysMatch
-    def match(input)
+    def match?(input)
       return true
     end
   end
@@ -163,8 +163,8 @@ module Linguist
       @pat = pat
     end
 
-    def match(input)
-      return !@pat.match(input)
+    def match?(input)
+      return !@pat.match?(input)
     end
 
   end
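The heuristics change is a mechanical rename from match to match? on the internal matcher objects, backed by Regexp#match? instead of Regexp#match. The practical difference, shown here in plain Ruby and independent of linguist's internals, is that match? only answers the yes/no question the rules need and skips building a MatchData object:

pattern = /^\s*%\{/

# Regexp#match allocates a MatchData (and sets $~) even though the
# heuristic rules only care whether the pattern matched at all.
pattern.match("%{\n  config\n}")    # => #<MatchData "%{">

# Regexp#match? returns a plain boolean with no MatchData allocation,
# matching the predicate-style contract of the renamed match? methods.
pattern.match?("%{\n  config\n}")   # => true
pattern.match?("plain text")        # => false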