gitlab-linguist 2.9.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1f61bbc6a1106207f7c4791dc3c4bcd83600fa59
4
+ data.tar.gz: 1433a3391e6247ba26603ba25a7028f3fea9a45f
5
+ SHA512:
6
+ metadata.gz: 9902468506da9cc6e5a8ddf684001c7e117a0285717aedb625cea00c7ba19f6689131fadd8a10e19cbe02bc666fd1e02bc49afe75b8e37fd8ce95184b43e6e61
7
+ data.tar.gz: 18eb029e57495598de5b8c9b8d9d630b9160d3c0b2c8d0db4ac212924d0f6952726ee9134a0288ae8a4fec4be529899fac88fa68d39c319ac19a60a3864ee720
@@ -0,0 +1,46 @@
1
#!/usr/bin/env ruby

# linguist — detect language type for a file, or, given a directory, determine language breakdown
#
# usage: linguist <path>

require 'linguist/file_blob'
require 'linguist/repository'

# Default to the current working directory when no path argument is given.
target = ARGV[0] || Dir.pwd

if File.directory?(target)
  # Directory mode: print each language's share of the repository,
  # largest share first.
  repository = Linguist::Repository.from_directory(target)
  repository.languages.sort_by { |_, bytes| -bytes }.each do |language, bytes|
    pct = ((bytes / repository.size.to_f) * 100).round
    puts "%-4s %s" % ["#{pct}%", language]
  end
elsif File.file?(target)
  # File mode: report line counts, detected type, mime type and language.
  blob = Linguist::FileBlob.new(target, Dir.pwd)

  kind =
    if blob.text?
      'Text'
    elsif blob.image?
      'Image'
    else
      'Binary'
    end

  puts "#{blob.name}: #{blob.loc} lines (#{blob.sloc} sloc)"
  puts " type: #{kind}"
  puts " mime type: #{blob.mime_type}"
  puts " language: #{blob.language}"

  puts " blob is too large to be shown" if blob.large?
  puts " appears to be generated source code" if blob.generated?
  puts " appears to be a vendored file" if blob.vendored?
else
  abort "usage: linguist <path>"
end
@@ -0,0 +1,5 @@
1
+ require 'linguist/blob_helper'
2
+ require 'linguist/generated'
3
+ require 'linguist/language'
4
+ require 'linguist/repository'
5
+ require 'linguist/samples'
@@ -0,0 +1,316 @@
1
+ require 'linguist/generated'
2
+ require 'linguist/language'
3
+
4
+ require 'charlock_holmes'
5
+ require 'escape_utils'
6
+ require 'mime/types'
7
+ require 'pygments'
8
+ require 'yaml'
9
+
10
module Linguist
  # DEPRECATED Avoid mixing into Blob classes. Prefer functional interfaces
  # like `Language.detect` over `Blob#language`. Functions are much easier to
  # cache and compose.
  #
  # Avoid adding additional bloat to this module.
  #
  # BlobHelper is a mixin for Blobish classes that respond to "name",
  # "data" and "size" such as Grit::Blob.
  module BlobHelper
    # Public: File extension of the blob's path, including the leading dot.
    #
    # Examples
    #
    #   blob(name='foo.rb').extname
    #   # => '.rb'
    #
    # Returns a String.
    def extname
      File.extname(name.to_s)
    end

    # Internal: Look up the MIME type for the blob's extension.
    #
    # Returns a MIME::Type, or nil when the extension is unknown.
    def _mime_type
      return @_mime_type if defined? @_mime_type

      candidates = ::MIME::Types.type_for(extname.to_s)

      # Text mime types win over binary ones; otherwise take the first guess.
      @_mime_type = candidates.detect { |type| type.ascii? } || candidates.first
    end

    # Public: The blob's mime type, e.g. 'text/plain' or 'text/html'.
    # Falls back to 'text/plain' when the extension is unknown.
    #
    # Returns a mime type String.
    def mime_type
      guess = _mime_type
      guess ? guess.to_s : 'text/plain'
    end

    # Internal: Is the blob binary according to its mime type?
    #
    # Return true or false
    def binary_mime_type?
      _mime_type.nil? ? false : _mime_type.binary?
    end

    # Internal: Binary per the mime type, unless the languages.yml database
    # recognizes the filename and overrides the guess.
    #
    # Return true or false
    def likely_binary?
      binary_mime_type? && !Language.find_by_filename(name)
    end

    # Public: Get the Content-Type header value.
    #
    # Used when serving raw blobs.
    #
    # Examples
    #
    #   # => 'text/plain; charset=utf-8'
    #   # => 'application/octet-stream'
    #
    # Returns a content type String.
    def content_type
      @content_type ||=
        if binary_mime_type? || binary?
          mime_type
        elsif encoding
          "text/plain; charset=#{encoding.downcase}"
        else
          "text/plain"
        end
    end

    # Public: Get the Content-Disposition header value.
    #
    # Used when serving raw blobs.
    #
    #   # => "attachment; filename=file.tar"
    #   # => "inline"
    #
    # Returns a content disposition String.
    def disposition
      if text? || image?
        'inline'
      elsif name.nil?
        "attachment"
      else
        "attachment; filename=#{EscapeUtils.escape_url(File.basename(name))}"
      end
    end

    # Public: The detected character encoding name, or nil when unknown.
    def encoding
      detected = detect_encoding
      detected[:encoding] if detected
    end

    # Try to guess the encoding
    #
    # Returns: a Hash, with :encoding, :confidence, :type
    # this will return nil if an error occurred during detection or
    # no valid encoding could be found
    def detect_encoding
      @detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data) if data
    end

    # Public: Is the blob binary?
    #
    # Return true or false
    def binary?
      # Large blobs aren't even loaded into memory, so nil data means binary.
      return true if data.nil?

      # Treat blank files as text.
      return false if data == ""

      # Charlock couldn't make sense of it.
      return true if encoding.nil?

      # Trust Charlock's verdict.
      detect_encoding[:type] == :binary
    end

    # Public: Is the blob text?
    #
    # Return true or false
    def text?
      !binary?
    end

    # Public: Is the blob a supported image format?
    #
    # Return true or false
    def image?
      %w[.png .jpg .jpeg .gif].include?(extname.downcase)
    end

    # Public: Is the blob a supported 3D model format?
    #
    # Return true or false
    def solid?
      '.stl' == extname.downcase
    end

    # Public: Is this blob a CSV file?
    #
    # Return true or false
    def csv?
      text? && '.csv' == extname.downcase
    end

    # Public: Is the blob a PDF?
    #
    # Return true or false
    def pdf?
      '.pdf' == extname.downcase
    end

    MEGABYTE = 1024 * 1024

    # Public: Is the blob too big to load?
    #
    # Return true or false
    def large?
      size.to_i > MEGABYTE
    end

    # Public: Is the blob safe to colorize?
    #
    # We use Pygments for syntax highlighting blobs. Pygments
    # can be too slow for very large blobs or for certain
    # corner-case blobs.
    #
    # Return true or false
    def safe_to_colorize?
      !large? && text? && !high_ratio_of_long_lines?
    end

    # Internal: Does the blob have a high ratio of long lines?
    #
    # These types of files are usually going to make Pygments.rb
    # angry if we try to colorize them.
    #
    # Return true or false
    def high_ratio_of_long_lines?
      count = loc
      return false if count == 0
      size / count > 5000
    end

    # Public: Is the blob viewable?
    #
    # Non-viewable blobs will just show a "View Raw" link.
    #
    # Return true or false
    def viewable?
      !large? && text?
    end

    patterns = YAML.load_file(File.expand_path("../vendor.yml", __FILE__))
    VendoredRegexp = Regexp.new(patterns.join('|'))

    # Public: Is the blob in a vendored directory?
    #
    # Vendored files are ignored by language statistics.
    #
    # See "vendor.yml" for a list of vendored conventions that match
    # this pattern.
    #
    # Return true or false
    def vendored?
      name =~ VendoredRegexp ? true : false
    end

    # Public: Get each line of data.
    #
    # Requires Blob#data
    #
    # Returns an Array of lines
    def lines
      @lines ||=
        if viewable? && data
          # -1 keeps trailing empty fields, so a trailing newline
          # produces a final empty line.
          data.split(/\r\n|\r|\n/, -1)
        else
          []
        end
    end

    # Public: Get number of lines of code.
    #
    # Requires Blob#data
    #
    # Returns Integer
    def loc
      lines.size
    end

    # Public: Get number of source lines of code (non-blank lines).
    #
    # Requires Blob#data
    #
    # Returns Integer
    def sloc
      lines.count { |line| line =~ /\S/ }
    end

    # Public: Is the blob a generated file?
    #
    # Generated source code is suppressed in diffs and is ignored by
    # language statistics.
    #
    # May load Blob#data
    #
    # Return true or false
    def generated?
      @_generated ||= Generated.generated?(name, -> { data })
    end

    # Public: Detects the Language of the blob.
    #
    # May load Blob#data
    #
    # Returns a Language or nil if none is detected
    def language
      return @language if defined? @language

      source =
        if defined?(@data) && @data.is_a?(String)
          # Data already in memory — hand it straight to the detector.
          @data
        else
          # Defer loading; binary blobs are detected as empty text.
          lambda { (binary_mime_type? || binary?) ? "" : self.data }
        end

      # NOTE(review): `mode` is assumed to be provided by the including
      # blob class (e.g. Grit::Blob) — confirm against callers.
      @language = Language.detect(name.to_s, source, mode)
    end

    # Internal: Get the lexer of the blob.
    #
    # Returns a Lexer.
    def lexer
      language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
    end

    # Public: Highlight syntax of blob.
    #
    # options - A Hash of options (defaults to {})
    #
    # Returns html String
    def colorize(options = {})
      return unless safe_to_colorize?
      options[:options] ||= {}
      options[:options][:encoding] ||= encoding
      lexer.highlight(data, options)
    end
  end
end
@@ -0,0 +1,171 @@
1
+ require 'linguist/tokenizer'
2
+
3
module Linguist
  # Language bayesian classifier.
  class Classifier
    # Public: Train classifier that data is a certain language.
    #
    # db - Hash classifier database object
    # language - String language of data
    # data - String contents of file
    #
    # Examples
    #
    #   Classifier.train(db, 'Ruby', "def hello; end")
    #
    # Returns nothing.
    #
    # Set LINGUIST_DEBUG=1 or =2 to see probabilities per-token,
    # per-language. See also dump_all_tokens, below.
    def self.train!(db, language, data)
      tokens = Tokenizer.tokenize(data)

      # Make sure every top-level counter/table exists before updating.
      db['tokens_total'] ||= 0
      db['languages_total'] ||= 0
      db['tokens'] ||= {}
      db['language_tokens'] ||= {}
      db['languages'] ||= {}

      tokens.each do |token|
        per_language = (db['tokens'][language] ||= {})
        per_language[token] = per_language.fetch(token, 0) + 1
        db['language_tokens'][language] = db['language_tokens'].fetch(language, 0) + 1
        db['tokens_total'] += 1
      end

      db['languages'][language] = db['languages'].fetch(language, 0) + 1
      db['languages_total'] += 1

      nil
    end

    # Public: Guess language of data.
    #
    # db - Hash of classifier tokens database.
    # tokens - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to.
    #
    # Examples
    #
    #   Classifier.classify(db, "def hello; end")
    #   # => [ 'Ruby', 0.90], ['Python', 0.2], ... ]
    #
    # Returns sorted Array of result pairs. Each pair contains the
    # String language name and a Float score.
    def self.classify(db, tokens, languages = nil)
      new(db).classify(tokens, languages || db['languages'].keys)
    end

    # Internal: Initialize a Classifier from a database Hash.
    def initialize(db = {})
      @tokens_total = db['tokens_total']
      @languages_total = db['languages_total']
      @tokens = db['tokens']
      @language_tokens = db['language_tokens']
      @languages = db['languages']
    end

    # Internal: Guess language of data.
    #
    # tokens - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to.
    #
    # Returns sorted Array of result pairs. Each pair contains the
    # String language name and a Float score.
    def classify(tokens, languages)
      return [] if tokens.nil?
      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)

      dump_all_tokens(tokens, languages) if verbosity >= 2

      scores = languages.each_with_object({}) do |language, acc|
        acc[language] = tokens_probability(tokens, language) +
                        language_probability(language)
        if verbosity >= 1
          printf "%10s = %10.3f + %7.3f = %10.3f\n",
            language, tokens_probability(tokens, language), language_probability(language), acc[language]
        end
      end

      # Best score first.
      scores.sort { |a, b| b[1] <=> a[1] }
    end

    # Internal: Probability of a set of tokens in a language occurring - P(D | C)
    #
    # tokens - Array of String tokens.
    # language - Language to check.
    #
    # Returns the summed log-probability (a Float).
    def tokens_probability(tokens, language)
      tokens.reduce(0.0) do |total, token|
        total + Math.log(token_probability(token, language))
      end
    end

    # Internal: Probability of a token in a language occurring - P(F | C)
    #
    # token - String token.
    # language - Language to check.
    #
    # Returns Float between 0.0 and 1.0.
    def token_probability(token, language)
      seen = @tokens[language][token].to_f
      if seen == 0.0
        # Unseen token: smooth with one count over the global total.
        1 / @tokens_total.to_f
      else
        seen / @language_tokens[language].to_f
      end
    end

    # Internal: Probability of a language occurring - P(C)
    #
    # language - Language to check.
    #
    # Returns the log-probability (a Float).
    def language_probability(language)
      Math.log(@languages[language].to_f / @languages_total.to_f)
    end

    private

    # Internal: Debug verbosity taken from the LINGUIST_DEBUG env var.
    def verbosity
      @verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
    end

    # Internal: show a table of probabilities for each <token,language> pair.
    #
    # The number in each table entry is the number of "points" that each
    # token contributes toward the belief that the file under test is a
    # particular language. Points are additive.
    #
    # Points are the number of times a token appears in the file, times
    # how much more likely (log of probability ratio) that token is to
    # appear in one language vs. the least-likely language. Dashes
    # indicate the least-likely language (and zero points) for each token.
    def dump_all_tokens(tokens, languages)
      width = tokens.map { |tok| tok.size }.max

      printf "%#{width}s", ""
      puts " #" + languages.map { |lang| sprintf("%10s", lang) }.join

      counts = Hash.new(0)
      tokens.each { |tok| counts[tok] += 1 }

      counts.sort.each do |tok, count|
        pairs = languages.map { |lang| [lang, token_probability(tok, lang)] }
        lowest = pairs.map { |_, prob| prob }.min
        lowest_log = Math.log(lowest)

        # Skip tokens that are equally likely in every language —
        # they contribute no points anywhere.
        next if pairs.all? { |_, prob| prob == pairs[0][1] }

        printf "%#{width}s%5d", tok, count

        puts pairs.map { |_, prob|
          prob == lowest ? " -" : sprintf("%10.3f", count * (Math.log(prob) - lowest_log))
        }.join
      end
    end
  end
end