gitlab-linguist 2.9.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/linguist +46 -0
- data/lib/linguist.rb +5 -0
- data/lib/linguist/blob_helper.rb +316 -0
- data/lib/linguist/classifier.rb +171 -0
- data/lib/linguist/file_blob.rb +56 -0
- data/lib/linguist/generated.rb +185 -0
- data/lib/linguist/language.rb +495 -0
- data/lib/linguist/languages.yml +1585 -0
- data/lib/linguist/md5.rb +38 -0
- data/lib/linguist/popular.yml +29 -0
- data/lib/linguist/repository.rb +95 -0
- data/lib/linguist/samples.json +41457 -0
- data/lib/linguist/samples.rb +98 -0
- data/lib/linguist/tokenizer.rb +198 -0
- data/lib/linguist/vendor.yml +129 -0
- metadata +171 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 1f61bbc6a1106207f7c4791dc3c4bcd83600fa59
|
4
|
+
data.tar.gz: 1433a3391e6247ba26603ba25a7028f3fea9a45f
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 9902468506da9cc6e5a8ddf684001c7e117a0285717aedb625cea00c7ba19f6689131fadd8a10e19cbe02bc666fd1e02bc49afe75b8e37fd8ce95184b43e6e61
|
7
|
+
data.tar.gz: 18eb029e57495598de5b8c9b8d9d630b9160d3c0b2c8d0db4ac212924d0f6952726ee9134a0288ae8a4fec4be529899fac88fa68d39c319ac19a60a3864ee720
|
data/bin/linguist
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
#!/usr/bin/env ruby

# linguist — detect language type for a file, or, given a directory, determine language breakdown
#
# usage: linguist <path>

require 'linguist/file_blob'
require 'linguist/repository'

# Default to the current working directory when no path argument is given.
target = ARGV[0] || Dir.pwd

if File.directory?(target)
  # Directory mode: print each language's share of the repository, largest first.
  repository = Linguist::Repository.from_directory(target)
  ranked = repository.languages.sort_by { |_, size| size }.reverse
  ranked.each do |language, size|
    percentage = ((size / repository.size.to_f) * 100).round
    puts "%-4s %s" % ["#{percentage}%", language]
  end
elsif File.file?(target)
  # File mode: report line counts, content type, and detection flags for one blob.
  blob = Linguist::FileBlob.new(target, Dir.pwd)

  kind =
    if blob.text?
      'Text'
    elsif blob.image?
      'Image'
    else
      'Binary'
    end

  puts "#{blob.name}: #{blob.loc} lines (#{blob.sloc} sloc)"
  puts " type: #{kind}"
  puts " mime type: #{blob.mime_type}"
  puts " language: #{blob.language}"

  puts " blob is too large to be shown" if blob.large?

  puts " appears to be generated source code" if blob.generated?

  puts " appears to be a vendored file" if blob.vendored?
else
  abort "usage: linguist <path>"
end
|
data/lib/linguist/blob_helper.rb
ADDED
@@ -0,0 +1,316 @@
|
|
1
|
+
require 'linguist/generated'
|
2
|
+
require 'linguist/language'
|
3
|
+
|
4
|
+
require 'charlock_holmes'
|
5
|
+
require 'escape_utils'
|
6
|
+
require 'mime/types'
|
7
|
+
require 'pygments'
|
8
|
+
require 'yaml'
|
9
|
+
|
10
|
+
module Linguist
  # DEPRECATED Avoid mixing into Blob classes. Prefer functional interfaces
  # like `Language.detect` over `Blob#language`. Functions are much easier to
  # cache and compose.
  #
  # Avoid adding additional bloat to this module.
  #
  # BlobHelper is a mixin for Blobish classes that respond to "name",
  # "data" and "size" such as Grit::Blob.
  module BlobHelper
    # Public: Get the extname of the path
    #
    # Examples
    #
    #   blob(name='foo.rb').extname
    #   # => '.rb'
    #
    # Returns a String
    def extname
      File.extname(name.to_s)
    end

    # Internal: Lookup mime type for extension.
    #
    # Returns a MIME::Type
    def _mime_type
      if defined? @_mime_type
        @_mime_type
      else
        guesses = ::MIME::Types.type_for(extname.to_s)

        # Prefer text mime types over binary
        @_mime_type = guesses.detect { |type| type.ascii? } ||
          # Otherwise use the first guess
          guesses.first
      end
    end

    # Public: Get the actual blob mime type
    #
    # Examples
    #
    #   # => 'text/plain'
    #   # => 'text/html'
    #
    # Returns a mime type String.
    def mime_type
      _mime_type ? _mime_type.to_s : 'text/plain'
    end

    # Internal: Is the blob binary according to its mime type
    #
    # Return true or false
    def binary_mime_type?
      _mime_type ? _mime_type.binary? : false
    end

    # Internal: Is the blob binary according to its mime type,
    # overriding it if we have better data from the languages.yml
    # database.
    #
    # Return true or false
    def likely_binary?
      binary_mime_type? && !Language.find_by_filename(name)
    end

    # Public: Get the Content-Type header value
    #
    # This value is used when serving raw blobs.
    #
    # Examples
    #
    #   # => 'text/plain; charset=utf-8'
    #   # => 'application/octet-stream'
    #
    # Returns a content type String.
    def content_type
      @content_type ||= (binary_mime_type? || binary?) ? mime_type :
        (encoding ? "text/plain; charset=#{encoding.downcase}" : "text/plain")
    end

    # Public: Get the Content-Disposition header value
    #
    # This value is used when serving raw blobs.
    #
    #   # => "attachment; filename=file.tar"
    #   # => "inline"
    #
    # Returns a content disposition String.
    def disposition
      if text? || image?
        'inline'
      elsif name.nil?
        "attachment"
      else
        "attachment; filename=#{EscapeUtils.escape_url(File.basename(name))}"
      end
    end

    # Public: Get the detected character encoding name.
    #
    # Returns a String (e.g. "UTF-8") or nil when detection failed.
    def encoding
      if hash = detect_encoding
        hash[:encoding]
      end
    end

    # Try to guess the encoding
    #
    # Returns: a Hash, with :encoding, :confidence, :type
    # this will return nil if an error occurred during detection or
    # no valid encoding could be found
    def detect_encoding
      # FIX: use a `defined?` guard instead of `||=` so that a nil result
      # (Charlock could not detect an encoding) is memoized too; with `||=`
      # the detector would be re-run on every call for such blobs.
      return @detect_encoding if defined? @detect_encoding
      @detect_encoding = CharlockHolmes::EncodingDetector.new.detect(data) if data
    end

    # Public: Is the blob binary?
    #
    # Return true or false
    def binary?
      # Large blobs aren't even loaded into memory
      if data.nil?
        true

      # Treat blank files as text
      elsif data == ""
        false

      # Charlock doesn't know what to think
      elsif encoding.nil?
        true

      # If Charlock says its binary
      else
        detect_encoding[:type] == :binary
      end
    end

    # Public: Is the blob text?
    #
    # Return true or false
    def text?
      !binary?
    end

    # Public: Is the blob a supported image format?
    #
    # Return true or false
    def image?
      ['.png', '.jpg', '.jpeg', '.gif'].include?(extname.downcase)
    end

    # Public: Is the blob a supported 3D model format?
    #
    # Return true or false
    def solid?
      extname.downcase == '.stl'
    end

    # Public: Is this blob a CSV file?
    #
    # Return true or false
    def csv?
      text? && extname.downcase == '.csv'
    end

    # Public: Is the blob a PDF?
    #
    # Return true or false
    def pdf?
      extname.downcase == '.pdf'
    end

    MEGABYTE = 1024 * 1024

    # Public: Is the blob too big to load?
    #
    # Return true or false
    def large?
      size.to_i > MEGABYTE
    end

    # Public: Is the blob safe to colorize?
    #
    # We use Pygments for syntax highlighting blobs. Pygments
    # can be too slow for very large blobs or for certain
    # corner-case blobs.
    #
    # Return true or false
    def safe_to_colorize?
      !large? && text? && !high_ratio_of_long_lines?
    end

    # Internal: Does the blob have a ratio of long lines?
    #
    # These types of files are usually going to make Pygments.rb
    # angry if we try to colorize them.
    #
    # Return true or false
    def high_ratio_of_long_lines?
      return false if loc == 0
      # Integer division: average bytes per line above 5000 flags the blob.
      size / loc > 5000
    end

    # Public: Is the blob viewable?
    #
    # Non-viewable blobs will just show a "View Raw" link
    #
    # Return true or false
    def viewable?
      !large? && text?
    end

    # Load vendored-path patterns once at module definition time and
    # compile them into a single alternation regexp.
    vendored_paths = YAML.load_file(File.expand_path("../vendor.yml", __FILE__))
    VendoredRegexp = Regexp.new(vendored_paths.join('|'))

    # Public: Is the blob in a vendored directory?
    #
    # Vendored files are ignored by language statistics.
    #
    # See "vendor.yml" for a list of vendored conventions that match
    # this pattern.
    #
    # Return true or false
    def vendored?
      name =~ VendoredRegexp ? true : false
    end

    # Public: Get each line of data
    #
    # Requires Blob#data
    #
    # Returns an Array of lines
    def lines
      @lines ||=
        if viewable? && data
          # Limit of -1 keeps trailing empty fields, so a trailing
          # newline yields a final empty line.
          data.split(/\r\n|\r|\n/, -1)
        else
          []
        end
    end

    # Public: Get number of lines of code
    #
    # Requires Blob#data
    #
    # Returns Integer
    def loc
      lines.size
    end

    # Public: Get number of source lines of code
    #
    # Requires Blob#data
    #
    # Returns Integer
    def sloc
      lines.grep(/\S/).size
    end

    # Public: Is the blob a generated file?
    #
    # Generated source code is suppressed in diffs and is ignored by
    # language statistics.
    #
    # May load Blob#data
    #
    # Return true or false
    def generated?
      # FIX: the previous `@_generated ||= ...` never memoized a `false`
      # answer, so the (potentially data-loading) Generated check re-ran on
      # every call for non-generated blobs. Guard with `defined?` instead.
      return @_generated if defined? @_generated
      @_generated = Generated.generated?(name, lambda { data })
    end

    # Public: Detects the Language of the blob.
    #
    # May load Blob#data
    #
    # Returns a Language or nil if none is detected
    def language
      return @language if defined? @language

      if defined?(@data) && @data.is_a?(String)
        data = @data
      else
        # Defer loading data until detection actually needs it; binary
        # blobs are passed as empty strings.
        data = lambda { (binary_mime_type? || binary?) ? "" : self.data }
      end

      # NOTE(review): `mode` is not defined in this mixin — it is expected
      # to be provided by the including Blob class; confirm at include site.
      @language = Language.detect(name.to_s, data, mode)
    end

    # Internal: Get the lexer of the blob.
    #
    # Returns a Lexer.
    def lexer
      language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
    end

    # Public: Highlight syntax of blob
    #
    # options - A Hash of options (defaults to {})
    #
    # Returns html String
    def colorize(options = {})
      return unless safe_to_colorize?
      options[:options] ||= {}
      options[:options][:encoding] ||= encoding
      lexer.highlight(data, options)
    end
  end
end
|
data/lib/linguist/classifier.rb
ADDED
@@ -0,0 +1,171 @@
|
|
1
|
+
require 'linguist/tokenizer'
|
2
|
+
|
3
|
+
module Linguist
  # Language bayesian classifier.
  class Classifier
    # Public: Train classifier that data is a certain language.
    #
    # db       - Hash classifier database object
    # language - String language of data
    # data     - String contents of file
    #
    # Examples
    #
    #   Classifier.train(db, 'Ruby', "def hello; end")
    #
    # Returns nothing.
    #
    # Set LINGUIST_DEBUG=1 or =2 to see probabilities per-token,
    # per-language. See also dump_all_tokens, below.
    def self.train!(db, language, data)
      tokens = Tokenizer.tokenize(data)

      db['tokens_total'] ||= 0
      db['languages_total'] ||= 0
      db['tokens'] ||= {}
      db['language_tokens'] ||= {}
      db['languages'] ||= {}

      tokens.each do |token|
        db['tokens'][language] ||= {}
        db['tokens'][language][token] ||= 0
        db['tokens'][language][token] += 1
        db['language_tokens'][language] ||= 0
        db['language_tokens'][language] += 1
        db['tokens_total'] += 1
      end
      db['languages'][language] ||= 0
      db['languages'][language] += 1
      db['languages_total'] += 1

      nil
    end

    # Public: Guess language of data.
    #
    # db        - Hash of classifier tokens database.
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to
    #             (defaults to every language in the db).
    #
    # Examples
    #
    #   Classifier.classify(db, "def hello; end")
    #   # => [ 'Ruby', 0.90], ['Python', 0.2], ... ]
    #
    # Returns sorted Array of result pairs. Each pair contains the
    # String language name and a Float score.
    def self.classify(db, tokens, languages = nil)
      languages ||= db['languages'].keys
      new(db).classify(tokens, languages)
    end

    # Internal: Initialize a Classifier.
    def initialize(db = {})
      @tokens_total    = db['tokens_total']
      @languages_total = db['languages_total']
      @tokens          = db['tokens']
      @language_tokens = db['language_tokens']
      @languages       = db['languages']
    end

    # Internal: Guess language of data
    #
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to.
    #
    # Returns sorted Array of result pairs. Each pair contains the
    # String language name and a Float score.
    def classify(tokens, languages)
      return [] if tokens.nil?
      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)

      scores = {}
      if verbosity >= 2
        dump_all_tokens(tokens, languages)
      end
      languages.each do |language|
        scores[language] = tokens_probability(tokens, language) +
                           language_probability(language)
        if verbosity >= 1
          printf "%10s = %10.3f + %7.3f = %10.3f\n",
            language, tokens_probability(tokens, language), language_probability(language), scores[language]
        end
      end

      # Highest score first.
      scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [score[0], score[1]] }
    end

    # Internal: Log-probability of a set of tokens occurring in a
    # language - log P(D | C)
    #
    # tokens   - Array of String tokens.
    # language - Language to check.
    #
    # Returns a Float log-probability (<= 0.0); NOT a value in [0, 1].
    def tokens_probability(tokens, language)
      tokens.inject(0.0) do |sum, token|
        sum + Math.log(token_probability(token, language))
      end
    end

    # Internal: Probability of token in language occurring - P(F | C)
    #
    # token    - String token.
    # language - Language to check.
    #
    # Returns Float between 0.0 and 1.0.
    def token_probability(token, language)
      # FIX: guard against a language that has no entry in the token db
      # (e.g. caller restricts to an untrained language); previously this
      # raised NoMethodError on nil. Unseen tokens get a small floor
      # probability instead of zero.
      count = (@tokens[language] || {})[token].to_f
      if count == 0.0
        1 / @tokens_total.to_f
      else
        count / @language_tokens[language].to_f
      end
    end

    # Internal: Log-probability of a language occurring - log P(C)
    #
    # language - Language to check.
    #
    # Returns a Float log-probability (<= 0.0); NOT a value in [0, 1].
    def language_probability(language)
      Math.log(@languages[language].to_f / @languages_total.to_f)
    end

    private
    def verbosity
      @verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
    end

    # Internal: show a table of probabilities for each <token,language> pair.
    #
    # The number in each table entry is the number of "points" that each
    # token contributes toward the belief that the file under test is a
    # particular language.  Points are additive.
    #
    # Points are the number of times a token appears in the file, times
    # how much more likely (log of probability ratio) that token is to
    # appear in one language vs.  the least-likely language.  Dashes
    # indicate the least-likely language (and zero points) for each token.
    def dump_all_tokens(tokens, languages)
      maxlen = tokens.map { |tok| tok.size }.max

      printf "%#{maxlen}s", ""
      puts "    #" + languages.map { |lang| sprintf("%10s", lang) }.join

      tokmap = Hash.new(0)
      tokens.each { |tok| tokmap[tok] += 1 }

      tokmap.sort.each { |tok, count|
        arr = languages.map { |lang| [lang, token_probability(tok, lang)] }
        min = arr.map { |a, b| b }.min
        minlog = Math.log(min)
        if !arr.inject(true) { |result, n| result && n[1] == arr[0][1] }
          printf "%#{maxlen}s%5d", tok, count

          puts arr.map { |ent|
            ent[1] == min ? "         -" : sprintf("%10.3f", count * (Math.log(ent[1]) - minlog))
          }.join
        end
      }
    end
  end
end
|