RubyGems - github-linguist - Versions diffs - 7.29.0 → 8.0.0 - Mend

github-linguist 7.29.0 → 8.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (125) hide show

checksums.yaml +4 -4
data/bin/git-linguist +7 -3
data/bin/github-linguist +6 -3
data/grammars/etc.json +1 -1
data/grammars/inline.edgeql.json +1 -1
data/grammars/inline.graphql.json +1 -1
data/grammars/inline.peggy.json +1 -0
data/grammars/markdown.gleam.codeblock.json +1 -0
data/grammars/markdown.move.codeblock.json +1 -1
data/grammars/markdown.rego.codeblock.json +1 -0
data/grammars/mdx.move.codeblock.json +1 -0
data/grammars/source.8xp.json +1 -1
data/grammars/source.Caddyfile-test.json +1 -0
data/grammars/source.Caddyfile.json +1 -0
data/grammars/source.abl.json +1 -1
data/grammars/source.astro.json +1 -1
data/grammars/source.bicep.json +1 -1
data/grammars/source.bms.json +1 -1
data/grammars/source.bqn.json +1 -0
data/grammars/source.brs.json +1 -1
data/grammars/source.cairo.json +1 -1
data/grammars/source.cairo0.json +1 -0
data/grammars/source.cl.json +1 -1
data/grammars/source.clar.json +1 -1
data/grammars/source.cmd.json +1 -1
data/grammars/source.cobol.json +1 -1
data/grammars/source.commonlisp.json +1 -1
data/grammars/source.cs.json +1 -1
data/grammars/source.curlrc.json +1 -1
data/grammars/source.curry.json +1 -1
data/grammars/source.cylc.json +1 -0
data/grammars/source.d2.json +1 -1
data/grammars/source.dart.json +1 -1
data/grammars/source.dds.dspf.json +1 -1
data/grammars/source.dds.icff.json +1 -1
data/grammars/source.dds.lf.json +1 -1
data/grammars/source.dds.pf.json +1 -1
data/grammars/source.dds.prtf.json +1 -1
data/grammars/source.dune.json +1 -0
data/grammars/source.elvish.json +1 -1
data/grammars/source.firrtl.json +1 -0
data/grammars/source.fsharp.json +1 -1
data/grammars/source.gdscript.json +1 -1
data/grammars/source.generic-db.json +1 -1
data/grammars/source.gitconfig.json +1 -1
data/grammars/source.gjs.json +1 -1
data/grammars/source.gleam.json +1 -1
data/grammars/source.gts.json +1 -1
data/grammars/source.hcl.json +1 -1
data/grammars/source.hcl.terraform.json +1 -1
data/grammars/source.hgignore.json +1 -1
data/grammars/source.hosts.json +1 -1
data/grammars/source.hx.json +1 -1
data/grammars/source.iCalendar.json +1 -0
data/grammars/source.ice.json +1 -1
data/grammars/source.julia.json +1 -1
data/grammars/source.just.json +1 -1
data/grammars/source.kotlin.json +1 -1
data/grammars/source.lcb.json +1 -0
data/grammars/source.lilypond.json +1 -1
data/grammars/source.livecodescript.json +1 -0
data/grammars/source.lua.json +1 -1
data/grammars/source.luau.json +1 -0
data/grammars/source.m2.json +1 -1
data/grammars/source.markdown.caddy.codeblock.json +1 -0
data/grammars/source.matlab.json +1 -1
data/grammars/source.mcfunction.json +1 -1
data/grammars/source.mdx.json +1 -1
data/grammars/source.mo.json +1 -1
data/grammars/source.mojo.json +1 -1
data/grammars/source.move.json +1 -1
data/grammars/source.nanorc.json +1 -1
data/grammars/source.nim.json +1 -1
data/grammars/source.nr.json +1 -0
data/grammars/source.nushell.json +1 -1
data/grammars/source.odin.json +1 -1
data/grammars/source.p4.json +1 -1
data/grammars/source.peggy.json +1 -0
data/grammars/source.pkl.json +1 -0
data/grammars/source.polar.json +1 -1
data/grammars/source.powerbuilder.json +1 -0
data/grammars/source.qsharp.json +1 -1
data/grammars/source.rascal.json +1 -1
data/grammars/source.rego.json +1 -1
data/grammars/source.rescript.json +1 -1
data/grammars/source.ron.json +1 -0
data/grammars/source.rpgle.json +1 -1
data/grammars/source.rust.json +1 -1
data/grammars/source.sentinel.json +1 -1
data/grammars/source.solidity.json +1 -1
data/grammars/source.sourcepawn.json +1 -1
data/grammars/source.sqf.json +1 -1
data/grammars/source.stan.json +1 -1
data/grammars/source.swift.json +1 -1
data/grammars/source.sy.json +1 -1
data/grammars/source.templ.json +1 -0
data/grammars/source.vba.json +1 -1
data/grammars/source.vcard.json +1 -0
data/grammars/source.wdl.json +1 -1
data/grammars/source.wsd.json +1 -1
data/grammars/text.adblock.json +1 -1
data/grammars/text.crontab.json +1 -0
data/grammars/text.html.jte.json +1 -0
data/grammars/text.html.statamic.json +1 -1
data/grammars/text.md.json +1 -1
data/grammars/text.mdx.astro.codeblock.json +1 -0
data/grammars/text.valve-cfg.json +1 -1
data/grammars/version +1 -1
data/lib/linguist/VERSION +1 -1
data/lib/linguist/classifier.rb +315 -106
data/lib/linguist/generated.rb +33 -4
data/lib/linguist/generic.yml +1 -0
data/lib/linguist/heuristics.rb +6 -6
data/lib/linguist/heuristics.yml +63 -4
data/lib/linguist/languages.json +1 -1
data/lib/linguist/languages.yml +224 -4
data/lib/linguist/repository.rb +8 -6
data/lib/linguist/samples.json +1 -1
data/lib/linguist/samples.rb +9 -1
data/lib/linguist/sha256.rb +1 -1
metadata +29 -7
data/grammars/inline.graphql.rb.json +0 -1
data/grammars/markdown.mcfunction.codeblock.json +0 -1
data/grammars/mdx.LANGUAGE.codeblock.json +0 -1
data/grammars/source.terraform.json +0 -1

data/grammars/text.mdx.astro.codeblock.json ADDED Viewed

@@ -0,0 +1 @@

+ {"scopeName":"text.mdx.astro.codeblock","patterns":[{"name":"markup.code.astro.mdx","contentName":"meta.embedded.astro","begin":"(?:^|\\G)[\\t ]*(`{3,})(?:[\\t ]*((?i:(?:.*\\.)?astro))(?:[\\t ]+((?:[^\\n\\r`])+))?)(?:[\\t ]*$)","end":"(\\1)(?:[\\t ]*$)","patterns":[{"include":"#astro-code-block"}],"beginCaptures":{"1":{"name":"string.other.begin.code.fenced.mdx"},"2":{"name":"entity.name.function.mdx"}},"endCaptures":{"1":{"name":"string.other.end.code.fenced.mdx"}}},{"name":"markup.code.astro.mdx","contentName":"meta.embedded.astro","begin":"(?:^|\\G)[\\t ]*(~{3,})(?:[\\t ]*((?i:(?:.*\\.)?astro))(?:[\\t ]+((?:[^\\n\\r])+))?)(?:[\\t ]*$)","end":"(\\1)(?:[\\t ]*$)","patterns":[{"include":"#astro-code-block"}],"beginCaptures":{"1":{"name":"string.other.begin.code.fenced.mdx"},"2":{"name":"entity.name.function.mdx"}},"endCaptures":{"1":{"name":"string.other.end.code.fenced.mdx"}}}],"repository":{"astro-code-block":{"patterns":[{"contentName":"meta.embedded.block.astro.frontmatter","begin":"^\\s*---\\s*$","end":"^\\s*---\\s*$","patterns":[{"include":"source.tsx"}],"beginCaptures":{"0":{"name":"punctuation.definition.tag.xi.begin.t"}},"endCaptures":{"0":{"name":"punctuation.definition.tag.xi.end.t"}}},{"include":"source.astro"}]}}}

data/grammars/text.valve-cfg.json CHANGED Viewed

	@@ -1 +1 @@
1	- {"scopeName":"text.valve-cfg","patterns":[{"include":"#comments"},{"include":"#function"},{"include":"#cvar"},{"include":"#value"}],"repository":{"comments":{"patterns":[{"name":"comment.valve-cfg","match":"\\/\\/."},{"name":"comment.block.valve-cfg","begin":"/\\","end":"\\/","captures":{"0":{"name":"comment.valve-cfg"}}}]},"cvar":{"patterns":[{"match":"^\\s(\"[A-Za-z0-9_-]\")","captures":{"1":{"name":"support.type.property-name.valve-cfg"}}},{"match":"^\\s('[A-Za-z0-9_-]')","captures":{"1":{"name":"support.type.property-name.valve-cfg"}}},{"match":"^\\s([A-Za-z0-9_-])","captures":{"1":{"name":"support.type.property-name.valve-cfg"}}}]},"function":{"patterns":[{"name":"support.function.valve-cfg","match":"^\\s\\b(alias\|bind_osx\|bind\|clear\|echo\|execifexists\|execwithwhitelist\|exec\|host_writeconfig_ss\|host_writeconfig\|key_updatelayout\|playvol\|say_team\|say\|unbindalljoystick\|unbindallmousekeyboard\|unbindall)","captures":{"1":{"name":"support.function.valve-cfg"}}}]},"numeric-literal":{"patterns":[{"name":"constant.numeric.float.~~sourcepawn~~","match":"[0-9]+\\.[0-9]+"},{"name":"constant.numeric.~~sourcepawn~~","match":"\\b0b[0-1]+\\b"},{"name":"constant.numeric.~~sourcepawn~~","match":"\\b0o[0-7]+\\b"},{"name":"constant.numeric.~~sourcepawn~~","match":"\\b0x[0-9a-fA-F]+\\b"},{"name":"constant.numeric.integer.~~sourcepawn~~","match":"\\b\\d+\\b"},{"name":"invalid.illegal.constant.~~sourcepawn~~","match":"\\b\\d+\\w+\\b"}]},"strings":{"name":"string.quoted.double.valve-cfg","begin":"\"","end":"\"","patterns":[{"name":"variable","match":"\\\\."}]},"value":{"begin":".","end":"\\n","patterns":[{"include":"#numeric-literal"},{"include":"#strings"}]}}}
1	+ {"scopeName":"text.valve-cfg","patterns":[{"include":"#comments"},{"include":"#function"},{"include":"#cvar"},{"include":"#value"}],"repository":{"comments":{"patterns":[{"name":"comment.valve-cfg","match":"\\/\\/."},{"name":"comment.block.valve-cfg","begin":"/\\","end":"\\/","captures":{"0":{"name":"comment.valve-cfg"}}}]},"cvar":{"patterns":[{"match":"^\\s(\"[A-Za-z0-9_-]\")","captures":{"1":{"name":"support.type.property-name.valve-cfg"}}},{"match":"^\\s('[A-Za-z0-9_-]')","captures":{"1":{"name":"support.type.property-name.valve-cfg"}}},{"match":"^\\s([A-Za-z0-9_-])","captures":{"1":{"name":"support.type.property-name.valve-cfg"}}}]},"function":{"patterns":[{"name":"support.function.valve-cfg","match":"^\\s\\b(alias\|bind_osx\|bind\|clear\|echo\|execifexists\|execwithwhitelist\|exec\|host_writeconfig_ss\|host_writeconfig\|key_updatelayout\|playvol\|say_team\|say\|unbindalljoystick\|unbindallmousekeyboard\|unbindall)","captures":{"1":{"name":"support.function.valve-cfg"}}}]},"numeric-literal":{"patterns":[{"name":"constant.numeric.float.valve-cfg","match":"[0-9]+\\.[0-9]+"},{"name":"constant.numeric.valve-cfg","match":"\\b0b[0-1]+\\b"},{"name":"constant.numeric.valve-cfg","match":"\\b0o[0-7]+\\b"},{"name":"constant.numeric.valve-cfg","match":"\\b0x[0-9a-fA-F]+\\b"},{"name":"constant.numeric.integer.valve-cfg","match":"\\b\\d+\\b"},{"name":"invalid.illegal.constant.valve-cfg","match":"\\b\\d+\\w+\\b"}]},"strings":{"name":"string.quoted.double.valve-cfg","begin":"\"","end":"\"","patterns":[{"name":"variable","match":"\\\\."}]},"value":{"begin":".","end":"\\n","patterns":[{"include":"#numeric-literal"},{"include":"#strings"}]}}}

data/grammars/version CHANGED Viewed

	@@ -1 +1 @@
1	- 7.29.0
1	+ 8.0.0

data/lib/linguist/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 7.29.0
1	+ 8.0.0

data/lib/linguist/classifier.rb CHANGED Viewed

@@ -1,8 +1,12 @@
 require 'linguist/tokenizer'
+require 'set'
 module Linguist
-  # Language bayesian classifier.
+  # Language content classifier.
   class Classifier
+    # Maximum number of bytes to consider for classification.
+    # This is only used at evaluation time. During training, full content of
+    # samples is used.
     CLASSIFIER_CONSIDER_BYTES = 50 * 1024
     # Public: Use the classifier to detect language of the blob.
@@ -28,41 +32,59 @@ module Linguist
     #
     # db       - Hash classifier database object
     # language - String language of data
-    # data     - String contents of file
+    # data     - String contents of file or array of tokens.
     #
     # Examples
     #
-    #   Classifier.train(db, 'Ruby', "def hello; end")
+    #   Classifier.train!(db, 'Ruby', "def hello; end")
     #
-    # Returns nothing.
+    # Returns nil.
     #
-    # Set LINGUIST_DEBUG=1 or =2 to see probabilities per-token or
-    # per-language.  See also #dump_all_tokens, below.
+    # Set LINGUIST_DEBUG=1, =2 or =3 to print internal statistics.
     def self.train!(db, language, data)
       tokens = data
       tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
-      counts = Hash.new(0)
-      tokens.each { |tok| counts[tok] += 1 }
+      db['vocabulary'] ||= {}
+      # Set hash to autoincremented index value
+      if db['vocabulary'].default_proc.nil?
+        db['vocabulary'].default_proc = proc do |hash, key|
+          hash[key] = hash.length
+        end
+      end
-      db['tokens_total'] ||= 0
-      db['languages_total'] ||= 0
-      db['tokens'] ||= {}
-      db['language_tokens'] ||= {}
-      db['languages'] ||= {}
+      db['samples'] ||= {}
+      db['samples'][language] ||= []
-      counts.each do |token, count|
-        db['tokens'][language] ||= {}
-        db['tokens'][language][token] ||= 0
-        db['tokens'][language][token] += count
-        db['language_tokens'][language] ||= 0
-        db['language_tokens'][language] += count
-        db['tokens_total'] += count
-      end
-      db['languages'][language] ||= 0
-      db['languages'][language] += 1
-      db['languages_total'] += 1
+      termfreq = to_vocabulary_index_termfreq(db['vocabulary'], tokens)
+      db['samples'][language] << termfreq
+      nil
+    end
+    # Public: Finalize training.
+    #
+    # db - Hash classifier database object
+    #
+    # Examples:
+    #   Classifier.finalize_train!(db)
+    #
+    # Returns nil.
+    #
+    # This method must be called after the last #train! call.
+    def self.finalize_train!(db)
+      db['vocabulary'] ||= {}
+      # Unset hash autoincrement
+      db['vocabulary'].default_proc = nil
+      db['samples'] ||= []
+      filter_vocab_by_freq! db, MIN_DOCUMENT_FREQUENCY
+      sort_vocab! db
+      db['icf'] = inverse_class_freqs db
+      normalize_samples! db
+      db['centroids'] = get_centroids db
+      db.delete 'samples'
       nil
     end
@@ -78,20 +100,17 @@ module Linguist
     #   # => [ 'Ruby', 0.90], ['Python', 0.2], ... ]
     #
     # Returns sorted Array of result pairs. Each pair contains the
-    # String language name and a Float score.
+    # String language name and a Float score between 0.0 and 1.0.
     def self.classify(db, tokens, languages = nil)
-      languages ||= db['languages'].keys
+      languages ||= db['centroids'].keys
       new(db).classify(tokens, languages)
     end
     # Internal: Initialize a Classifier.
     def initialize(db = {})
-      @tokens_total    = db['tokens_total']
-      @languages_total = db['languages_total']
-      @tokens          = db['tokens']
-      @language_tokens = db['language_tokens']
-      @languages       = db['languages']
-      @unknown_logprob = Math.log(1 / db['tokens_total'].to_f)
+      @vocabulary = db['vocabulary']
+      @centroids  = db['centroids']
+      @icf = db['icf']
     end
     # Internal: Guess language of data
@@ -100,72 +119,70 @@ module Linguist
     # languages - Array of language name Strings to restrict to.
     #
     # Returns sorted Array of result pairs. Each pair contains the
-    # String language name and a Float score.
+    # String language name and a Float score between 0.0 and 1.0.
     def classify(tokens, languages)
       return [] if tokens.nil? || languages.empty?
       tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
-      scores = {}
-      debug_dump_all_tokens(tokens, languages) if verbosity >= 2
+      debug_dump_tokens(tokens) if verbosity >= 3
-      counts = Hash.new(0)
-      tokens.each { |tok| counts[tok] += 1 }
+      vec = Classifier.to_vocabulary_index_termfreq_gaps(@vocabulary, tokens)
+      vec.each do |idx, freq|
+        tf = 1.0 + Math.log(freq)
+        vec[idx] = tf * @icf[idx]
+      end
+      return [] if vec.empty?
+      Classifier.l2_normalize!(vec)
+      scores = {}
       languages.each do |language|
-        scores[language] = tokens_probability(counts, language) + language_probability(language)
-        debug_dump_probabilities(counts, language, scores[language]) if verbosity >= 1
+        centroid = @centroids[language]
+        score = Classifier.similarity(vec, centroid)
+        if score > 0.0
+          scores[language] = score
+        end
       end
-      scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [score[0], score[1]] }
+      scores = scores.sort_by { |x| -x[1] }
+      debug_dump_all_tokens(tokens, scores) if verbosity >= 2
+      debug_dump_scores(scores) if verbosity >= 1
+      scores
     end
-    # Internal: Probably of set of tokens in a language occurring - P(D | C)
-    #
-    # tokens   - Array of String tokens.
-    # language - Language to check.
-    #
-    # Returns Float between 0.0 and 1.0.
-    def tokens_probability(counts, language)
-      sum = 0
-      counts.each do |token, count|
-        sum += count * token_probability(token, language)
+    private
+      MIN_DOCUMENT_FREQUENCY = 2
+      def verbosity
+        @verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
       end
-      sum
-    end
-    # Internal: Log-probability of token in language occurring - P(F | C)
-    #
-    # token    - String token.
-    # language - Language to check.
-    #
-    # Returns Float.
-    def token_probability(token, language)
-      count = @tokens[language][token]
-      if count.nil? || count == 0
-        # This is usually the most common case, so we cache the result.
-        @unknown_logprob
-      else
-        Math.log(count.to_f / @language_tokens[language].to_f)
+      def debug_dump_scores(scores)
+        headers = ["Language", "Score"]
+        rows = scores.map { |l, s| [l, "%.3f" % s] }
+        dump_table(headers, rows)
       end
-    end
-    # Internal: Probably of a language occurring - P(C)
-    #
-    # language - Language to check.
-    #
-    # Returns Float between 0.0 and 1.0.
-    def language_probability(language)
-      Math.log(@languages[language].to_f / @languages_total.to_f)
-    end
+      def debug_dump_tokens(tokens)
+        counts = Hash.new(0)
+        tokens.each do |tok|
+          idx = @vocabulary[tok]
+          if not idx.nil?
+            counts[tok] += 1
+          end
+        end
-    private
-      def verbosity
-        @verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
-      end
+        norm = Classifier.l2_norm(counts)
+        rows = counts.map do |tok, tf|
+          idx = @vocabulary[tok]
+          log_tf = 1.0 + Math.log(tf)
+          with_icf = log_tf * @icf[idx]
+          normalized = with_icf / norm
+          row = [tok, tf, "%.3f" % log_tf, "%.3f" % with_icf, "%.3f" % normalized]
+          [normalized, row]
+        end
-      def debug_dump_probabilities(tokens, language, score)
-        printf("%10s = %10.3f + %7.3f = %10.3f\n",
-            language, tokens_probability(tokens, language), language_probability(language), score)
+        headers = ["Token", "TF", "log", "*ICF", "L2"]
+        rows = rows.sort_by { |x| -x[0] }.map { |_, row| row }
+        dump_table(headers, rows)
       end
       # Internal: show a table of probabilities for each <token,language> pair.
@@ -173,31 +190,223 @@ module Linguist
       # The number in each table entry is the number of "points" that each
       # token contributes toward the belief that the file under test is a
       # particular language.  Points are additive.
-      #
-      # Points are the number of times a token appears in the file, times
-      # how much more likely (log of probability ratio) that token is to
-      # appear in one language vs. the least-likely language.  Dashes
-      # indicate the least-likely language (and zero points) for each token.
-      def debug_dump_all_tokens(tokens, languages)
-        maxlen = tokens.map { |tok| tok.size }.max
-        printf "%#{maxlen}s", ""
-        puts "    #" + languages.map { |lang| sprintf("%10s", lang) }.join
-        token_map = Hash.new(0)
-        tokens.each { |tok| token_map[tok] += 1 }
-        token_map.sort.each { |tok, count|
-          arr = languages.map { |lang| [lang, token_probability(tok, lang)] }
-          min = arr.map { |a,b| b }.min
-          if !arr.inject(true) { |result, n| result && n[1] == arr[0][1] }
-            printf "%#{maxlen}s%5d", tok, count
-            puts arr.map { |ent|
-              ent[1] == min ? "         -" : sprintf("%10.3f", count * (ent[1] - min))
-            }.join
+      def debug_dump_all_tokens(tokens, scores)
+        languages = scores.map { |l, _| l }
+        counts = Hash.new(0)
+        tokens.each do |tok|
+          idx = @vocabulary[tok]
+          if not idx.nil?
+            counts[tok] += 1
+          end
+        end
+        data = {}
+        norm = Classifier.l2_norm(counts)
+        languages.each do |language|
+          data[language] = {}
+          counts.each do |tok, tf|
+            idx = @vocabulary[tok]
+            log_tf = 1.0 + Math.log(tf)
+            with_icf = log_tf * @icf[idx]
+            normalized = with_icf / norm
+            data[language][tok] = normalized * @centroids[language][idx].to_f
+          end
+        end
+        norm = Classifier.l2_norm(counts)
+        rows = counts.map do |tok, tf|
+          idx = @vocabulary[tok]
+          log_tf = 1.0 + Math.log(tf)
+          with_icf = log_tf * @icf[idx]
+          normalized = with_icf / norm
+          scores = languages.map do |l, _|
+            [l, data[l][tok].to_f]
+          end
+          max_score = scores.to_h.values.max
+          row = [tok] + scores.map do |l, s|
+            if s == max_score
+              "%.4f*" % s
+            elsif s > 0.0
+              "%.4f" % s
+            else
+              "-"
+            end
+          end
+          [normalized, row]
+        end
+        headers = ["Token"] + (0..languages.length-1).map { |lidx| "[#{lidx}]" }
+        rows = rows.sort_by { |x| -x[0] }.map { |_, row| row }
+        legend = languages.each_with_index.map { |l, lidx| "[#{lidx}] = #{l}" }
+        dump_table(headers, rows, legend)
+      end
+      def dump_table(header, rows, legend = nil)
+        n_cols = header.length
+        rows = rows.map { |r| r.map { |c| c.to_s } }
+        col_widths = (0..n_cols - 1).map do |j|
+          ([header[j].length] + rows.map { |row| row[j].length }).max
+        end
+        sep_line = "| #{(0..n_cols-1).map { |j| "-" * col_widths[j] }.join(" | ")} |"
+        content_width = sep_line.length - 4
+        top_line = "| #{"-" * content_width} |"
+        format_row = Proc.new do |row|
+          cells = row.zip(col_widths).map do |cell, width|
+            "%-#{width}s" % cell
+          end
+          "| %s |" % cells.join(" | ")
+        end
+        puts top_line
+        puts format_row.call(header)
+        puts sep_line
+        rows.each do |row|
+          puts format_row.call(row)
+        end
+        puts top_line
+        if legend
+          legend.each do |line|
+            puts "| %-#{content_width}s |" % line
+          end
+          puts top_line
+        end
+      end
+      def self.to_vocabulary_index_termfreq(vocab, tokens)
+        counts = Hash.new(0)
+        tokens.each do |key|
+          idx = vocab[key]
+          counts[idx] += 1
+        end
+        counts
+      end
+      def self.to_vocabulary_index_termfreq_gaps(vocab, tokens)
+        counts = Hash.new(0)
+        tokens.each do |key|
+          if vocab.key? key
+            idx = vocab[key]
+            counts[idx] += 1
+          end
+        end
+        counts
+      end
+      def self.l2_norm(vec)
+        norm = vec.values.inject(0.0) { |sum, x| sum + x**2 }
+        Math.sqrt(norm)
+      end
+      def self.l2_normalize!(vec)
+        norm = l2_norm(vec)
+        vec.transform_values! do |value|
+          value.to_f / norm
+        end
+        nil
+      end
+      def self.similarity(a, b)
+        sum = 0.0
+        a.each_key do |idx|
+          if b.key? idx
+            sum += a[idx] * b[idx]
           end
-        }
+        end
+        sum
       end
+    # Filter vocabulary by minimum document frequency.
+    def self.filter_vocab_by_freq!(db, min_freq)
+      vocabulary = db['vocabulary']
+      # Get document frequencies
+      docfreq = Array.new(vocabulary.size, 0)
+      db['samples'].each_value do |samples|
+        samples.each do |sample|
+          sample.each_key do |idx|
+            docfreq[idx] += 1
+          end
+        end
+      end
+      vocabulary.select! do |_, idx|
+        docfreq[idx] >= min_freq
+      end
+      nil
+    end
+    # Sort vocabulary lexicographically.
+    def self.sort_vocab!(db)
+      new_indices = Hash.new { |h,k| h[k] = h.length }
+      db['vocabulary'].sort_by { |x| x[0] }.each do |term, idx|
+        db['vocabulary'][term] = new_indices[idx]
+      end
+      new_indices.default_proc = nil
+      db['samples'].transform_values! do |samples|
+        samples.map do |sample|
+          new_sample = {}
+          sample.each do |idx, freq|
+            new_idx = new_indices[idx]
+            if not new_idx.nil?
+              new_sample[new_idx] = freq
+            end
+          end
+          new_sample
+        end
+      end
+    end
+    # Compute inverse class frequency (ICF) for every term.
+    def self.inverse_class_freqs(db)
+      icf = Array.new(db['vocabulary'].size, 0)
+      db['samples'].each_value do |samples|
+        terms = Set.new
+        samples.each do |sample|
+          terms |= sample.keys
+        end
+        terms.each do |idx|
+          icf[idx] += 1
+        end
+      end
+      icf.map! do |val|
+        Math.log(db['samples'].size.to_f / val.to_f) + 1
+      end
+      icf
+    end
+    def self.normalize_samples!(db)
+      icf = db['icf']
+      db['samples'].each_value do |samples|
+        samples.each do |sample|
+          sample.each do |idx, freq|
+            tf = 1.0 + Math.log(freq)
+            sample[idx] = tf * icf[idx]
+          end
+          l2_normalize! sample
+        end
+      end
+    end
+    def self.get_centroids(db)
+      centroids = {}
+      db['samples'].each do |language, samples|
+        centroid = Hash.new(0.0)
+        samples.each do |sample|
+          sample.each do |idx, val|
+            centroid[idx] += val
+          end
+        end
+        centroid.each_key do |idx|
+          centroid[idx] = centroid[idx] / samples.length
+        end
+        l2_normalize! centroid
+        centroids[language] = centroid
+      end
+      centroids
+    end
   end
 end

data/lib/linguist/generated.rb CHANGED Viewed

@@ -60,12 +60,16 @@ module Linguist
       generated_net_specflow_feature_file? ||
       composer_lock? ||
       cargo_lock? ||
+      cargo_orig? ||
+      deno_lock? ||
       flake_lock? ||
+      bazel_lock? ||
       node_modules? ||
       go_vendor? ||
       go_lock? ||
       poetry_lock? ||
       pdm_lock? ||
+      uv_lock? ||
       esy_lock? ||
       npm_shrinkwrap_or_package_lock? ||
       pnpm_lock? ||
@@ -420,6 +424,13 @@ module Linguist
       !!name.match(/pdm\.lock/)
     end
+    # Internal: Is the blob a generated uv.lock?
+    #
+    # Returns true or false.
+    def uv_lock?
+      !!name.match(/uv\.lock/)
+    end
     # Internal: Is the blob a generated esy lock file?
     #
     # Returns true or false.
@@ -427,6 +438,13 @@ module Linguist
       !!name.match(/(^|\/)(\w+\.)?esy.lock$/)
     end
+    # Internal: Is the blob a generated deno lockfile, which are not meant for humans in pull requests.
+    #
+    # Returns true or false.
+    def deno_lock?
+      !!name.match(/deno\.lock/)
+    end
     # Internal: Is the blob a generated npm shrinkwrap or package lock file?
     #
     # Returns true or false.
@@ -477,6 +495,13 @@ module Linguist
       !!name.match(/Cargo\.lock/)
     end
+    # Internal: Is the blob a generated Rust Cargo original file?
+    #
+    # Returns true or false.
+    def cargo_orig?
+      !!name.match(/Cargo\.toml\.orig/)
+    end
     # Internal: Is the blob a generated Nix flakes lock file?
     #
     # Returns true or false
@@ -484,6 +509,13 @@ module Linguist
       !!name.match(/(^|\/)flake\.lock$/)
     end
+    # Internal: Is the blob a Bazel generated bzlmod lockfile?
+    #
+    # Returns true or false
+    def bazel_lock?
+      !!name.match(/(^|\/)MODULE\.bazel\.lock$/)
+    end
     # Is the blob a VCR Cassette file?
     #
     # Returns true or false
@@ -681,14 +713,11 @@ module Linguist
     # Internal: Is this a generated Game Maker Studio (2) metadata file?
     #
-    # All Game Maker Studio 2 generated files will be JSON, .yy or .yyp, and have
-    # a part that looks like "modelName: GMname" on the 3rd line
-    #
     # Return true or false
     def generated_gamemakerstudio?
       return false unless ['.yy', '.yyp'].include? extname
       return false unless lines.count > 3
-      return lines[2].match(/\"modelName\"\:\s*\"GM/) ||
+      return lines.first(3).join('').match?(/^\s*[\{\[]/) ||
              lines[0] =~ /^\d\.\d\.\d.+\|\{/
     end

data/lib/linguist/generic.yml CHANGED Viewed

@@ -16,6 +16,7 @@ extensions:
 - ".9"
 - ".app"
 - ".cmp"
+- ".resource"
 - ".sol"
 - ".stl"
 - ".tag"

data/lib/linguist/heuristics.rb CHANGED Viewed

@@ -126,7 +126,7 @@ module Linguist
     # Internal: Perform the heuristic
     def call(data)
       matched = @rules.find do |rule|
-        rule['pattern'].match(data)
+        rule['pattern'].match?(data)
       end
       if !matched.nil?
         languages = matched['language']
@@ -145,14 +145,14 @@ module Linguist
       @pats = pats
     end
-    def match(input)
-      return !@pats.any? { |pat| !pat.match(input) }
+    def match?(input)
+      return @pats.all? { |pat| pat.match?(input) }
     end
   end
   class AlwaysMatch
-    def match(input)
+    def match?(input)
       return true
     end
   end
@@ -163,8 +163,8 @@ module Linguist
       @pat = pat
     end
-    def match(input)
-      return !@pat.match(input)
+    def match?(input)
+      return !@pat.match?(input)
     end
   end