RubyGems - github-linguist - Versions diffs - 2.1.2 → 2.2.0 - Mend

github-linguist 2.1.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

data/lib/linguist.rb +1 -0
data/lib/linguist/blob_helper.rb +7 -241
data/lib/linguist/generated.rb +161 -0
data/lib/linguist/language.rb +37 -54
data/lib/linguist/languages.yml +5 -41
data/lib/linguist/samples.json +12055 -5573
data/lib/linguist/samples.rb +7 -5
data/lib/linguist/tokenizer.rb +47 -5
metadata +3 -2

data/lib/linguist.rb CHANGED

@@ -1,4 +1,5 @@
 require 'linguist/blob_helper'
+require 'linguist/generated'
 require 'linguist/language'
 require 'linguist/mime'
 require 'linguist/repository'

data/lib/linguist/blob_helper.rb CHANGED

@@ -1,7 +1,6 @@
-require 'linguist/classifier'
+require 'linguist/generated'
 require 'linguist/language'
 require 'linguist/mime'
-require 'linguist/samples'
 require 'charlock_holmes'
 require 'escape_utils'
@@ -129,15 +128,6 @@ module Linguist
       ['.png', '.jpg', '.jpeg', '.gif'].include?(extname)
     end
-    # Public: Is the blob likely to have a shebang?
-    #
-    # Return true or false
-    def shebang_extname?
-      extname.empty? &&
-        mode &&
-        (mode.to_i(8) & 05) == 05
-    end
     MEGABYTE = 1024 * 1024
     # Public: Is the blob too big to load?
@@ -221,143 +211,16 @@ module Linguist
       lines.grep(/\S/).size
     end
-    # Internal: Compute average line length.
-    #
-    # Returns Integer.
-    def average_line_length
-      if lines.any?
-        lines.inject(0) { |n, l| n += l.length } / lines.length
-      else
-        0
-      end
-    end
     # Public: Is the blob a generated file?
     #
     # Generated source code is supressed in diffs and is ignored by
     # language statistics.
     #
-    # Requires Blob#data
-    #
-    # Includes:
-    # - XCode project XML files
-    # - Minified JavaScript
-    # - Compiled CoffeeScript
-    # - PEG.js-generated parsers
-    #
-    # Please add additional test coverage to
-    # `test/test_blob.rb#test_generated` if you make any changes.
+    # May load Blob#data
     #
     # Return true or false
     def generated?
-      if name == 'Gemfile.lock' || minified_javascript? || compiled_coffeescript? ||
-      xcode_project_file? || generated_net_docfile? || generated_parser?
-        true
-      else
-        false
-      end
-    end
-    # Internal: Is the blob an XCode project file?
-    #
-    # Generated if the file extension is an XCode project
-    # file extension.
-    #
-    # Returns true of false.
-    def xcode_project_file?
-      ['.xib', '.nib', '.storyboard', '.pbxproj', '.xcworkspacedata', '.xcuserstate'].include?(extname)
-    end
-    # Internal: Is the blob minified JS?
-    #
-    # Consider JS minified if the average line length is
-    # greater then 100c.
-    #
-    # Returns true or false.
-    def minified_javascript?
-      return unless extname == '.js'
-      average_line_length > 100
-    end
-    # Internal: Is the blob of JS a parser generated by PEG.js?
-    #
-    # Requires Blob#data
-    #
-    # PEG.js-generated parsers are not meant to be consumed by humans.
-    #
-    # Return true or false
-    def generated_parser?
-      return false unless extname == '.js'
-      # PEG.js-generated parsers include a comment near the top  of the file
-      # that marks them as such.
-      if lines[0..4].join('') =~ /^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG.js/
-        return true
-      end
-      false
-    end
-    # Internal: Is the blob of JS generated by CoffeeScript?
-    #
-    # Requires Blob#data
-    #
-    # CoffeScript is meant to output JS that would be difficult to
-    # tell if it was generated or not. Look for a number of patterns
-    # output by the CS compiler.
-    #
-    # Return true or false
-    def compiled_coffeescript?
-      return false unless extname == '.js'
-      # CoffeeScript generated by > 1.2 include a comment on the first line
-      if lines[0] =~ /^\/\/ Generated by /
-        return true
-      end
-      if lines[0] == '(function() {' &&     # First line is module closure opening
-          lines[-2] == '}).call(this);' &&  # Second to last line closes module closure
-          lines[-1] == ''                   # Last line is blank
-        score = 0
-        lines.each do |line|
-          if line =~ /var /
-            # Underscored temp vars are likely to be Coffee
-            score += 1 * line.gsub(/(_fn|_i|_len|_ref|_results)/).count
-            # bind and extend functions are very Coffee specific
-            score += 3 * line.gsub(/(__bind|__extends|__hasProp|__indexOf|__slice)/).count
-          end
-        end
-        # Require a score of 3. This is fairly arbitrary. Consider
-        # tweaking later.
-        score >= 3
-      else
-        false
-      end
-    end
-    # Internal: Is this a generated documentation file for a .NET assembly?
-    #
-    # Requires Blob#data
-    #
-    # .NET developers often check in the XML Intellisense file along with an
-    # assembly - however, these don't have a special extension, so we have to
-    # dig into the contents to determine if it's a docfile. Luckily, these files
-    # are extremely structured, so recognizing them is easy.
-    #
-    # Returns true or false
-    def generated_net_docfile?
-      return false unless extname.downcase == ".xml"
-      return false unless lines.count > 3
-      # .NET Docfiles always open with <doc> and their first tag is an
-      # <assembly> tag
-      return lines[1].include?("<doc>") &&
-        lines[2].include?("<assembly>") &&
-        lines[-2].include?("</doc>")
+      @_generated ||= Generated.generated?(name, lambda { data })
     end
     # Public: Should the blob be indexed for searching?
@@ -375,6 +238,8 @@ module Linguist
     def indexable?
       if binary?
         false
+      elsif extname == '.txt'
+        true
       elsif language.nil?
         false
       elsif !language.searchable?
@@ -396,30 +261,11 @@ module Linguist
     def language
       if defined? @language
         @language
-      else
-        @language = guess_language
+      elsif !binary_mime_type?
+        @language = Language.detect(name.to_s, lambda { data }, mode)
       end
     end
-    # Internal: Guess language
-    #
-    # Please add additional test coverage to
-    # `test/test_blob.rb#test_language` if you make any changes.
-    #
-    # Returns a Language or nil
-    def guess_language
-      return if binary_mime_type?
-      # Disambiguate between multiple language extensions
-      disambiguate_extension_language ||
-        # See if there is a Language for the extension
-        Language.find_by_filename(name.to_s) ||
-        # Try to detect Language from shebang line
-        shebang_language
-    end
     # Internal: Get the lexer of the blob.
     #
     # Returns a Lexer.
@@ -427,86 +273,6 @@ module Linguist
       language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
     end
-    # Internal: Disambiguates between multiple language extensions.
-    #
-    # Returns a Language or nil.
-    def disambiguate_extension_language
-      if Language.ambiguous?(extname)
-        possible_languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
-        if possible_languages.any?
-          if result = Classifier.classify(Samples::DATA, data, possible_languages).first
-            Language[result[0]]
-          end
-        end
-      end
-    end
-    # Internal: Extract the script name from the shebang line
-    #
-    # Requires Blob#data
-    #
-    # Examples
-    #
-    #   '#!/usr/bin/ruby'
-    #   # => 'ruby'
-    #
-    #   '#!/usr/bin/env ruby'
-    #   # => 'ruby'
-    #
-    #   '#!/usr/bash/python2.4'
-    #   # => 'python'
-    #
-    # Please add additional test coverage to
-    # `test/test_blob.rb#test_shebang_script` if you make any changes.
-    #
-    # Returns a script name String or nil
-    def shebang_script
-      # Fail fast if blob isn't viewable?
-      return unless viewable?
-      if lines.any? && (match = lines[0].match(/(.+)\n?/)) && (bang = match[0]) =~ /^#!/
-        bang.sub!(/^#! /, '#!')
-        tokens = bang.split(' ')
-        pieces = tokens.first.split('/')
-        if pieces.size > 1
-          script = pieces.last
-        else
-          script = pieces.first.sub('#!', '')
-        end
-        script = script == 'env' ? tokens[1] : script
-        # python2.4 => python
-        if script =~ /((?:\d+\.?)+)/
-          script.sub! $1, ''
-        end
-        # Check for multiline shebang hacks that exec themselves
-        #
-        #   #!/bin/sh
-        #   exec foo "$0" "$@"
-        #
-        if script == 'sh' &&
-            lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) }
-          script = $1
-        end
-        script
-      end
-    end
-    # Internal: Get Language for shebang script
-    #
-    # Returns the Language or nil
-    def shebang_language
-      # Skip file extensions unlikely to have shebangs
-      return unless shebang_extname?
-      if script = shebang_script
-        Language[script]
-      end
-    end
     # Public: Highlight syntax of blob
     #
     # options - A Hash of options (defaults to {})

data/lib/linguist/generated.rb ADDED

@@ -0,0 +1,161 @@
+module Linguist
+  class Generated
+    # Public: Is the blob a generated file?
+    #
+    # name - String filename
+    # data - String blob data. A block also maybe passed in for lazy
+    #        loading. This behavior is deprecated and you should always
+    #        pass in a String.
+    #
+    # Return true or false
+    def self.generated?(name, data)
+      new(name, data).generated?
+    end
+    # Internal: Initialize Generated instance
+    #
+    # name - String filename
+    # data - String blob data
+    def initialize(name, data)
+      @name = name
+      @extname = File.extname(name)
+      @_data = data
+    end
+    attr_reader :name, :extname
+    # Lazy load blob data if block was passed in.
+    #
+    # Awful, awful stuff happening here.
+    #
+    # Returns String data.
+    def data
+      @data ||= @_data.respond_to?(:call) ? @_data.call() : @_data
+    end
+    # Public: Get each line of data
+    #
+    # Returns an Array of lines
+    def lines
+      @lines ||= data.split("\n", -1)
+    end
+    # Internal: Is the blob a generated file?
+    #
+    # Generated source code is supressed in diffs and is ignored by
+    # language statistics.
+    #
+    # Please add additional test coverage to
+    # `test/test_blob.rb#test_generated` if you make any changes.
+    #
+    # Return true or false
+    def generated?
+      name == 'Gemfile.lock' ||
+        minified_javascript? ||
+        compiled_coffeescript? ||
+        xcode_project_file? ||
+        generated_net_docfile? ||
+        generated_parser?
+    end
+    # Internal: Is the blob an XCode project file?
+    #
+    # Generated if the file extension is an XCode project
+    # file extension.
+    #
+    # Returns true of false.
+    def xcode_project_file?
+      ['.xib', '.nib', '.storyboard', '.pbxproj', '.xcworkspacedata', '.xcuserstate'].include?(extname)
+    end
+    # Internal: Is the blob minified JS?
+    #
+    # Consider JS minified if the average line length is
+    # greater then 100c.
+    #
+    # Returns true or false.
+    def minified_javascript?
+      return unless extname == '.js'
+      if lines.any?
+        (lines.inject(0) { |n, l| n += l.length } / lines.length) > 100
+      else
+        false
+      end
+    end
+    # Internal: Is the blob of JS generated by CoffeeScript?
+    #
+    # CoffeScript is meant to output JS that would be difficult to
+    # tell if it was generated or not. Look for a number of patterns
+    # output by the CS compiler.
+    #
+    # Return true or false
+    def compiled_coffeescript?
+      return false unless extname == '.js'
+      # CoffeeScript generated by > 1.2 include a comment on the first line
+      if lines[0] =~ /^\/\/ Generated by /
+        return true
+      end
+      if lines[0] == '(function() {' &&     # First line is module closure opening
+          lines[-2] == '}).call(this);' &&  # Second to last line closes module closure
+          lines[-1] == ''                   # Last line is blank
+        score = 0
+        lines.each do |line|
+          if line =~ /var /
+            # Underscored temp vars are likely to be Coffee
+            score += 1 * line.gsub(/(_fn|_i|_len|_ref|_results)/).count
+            # bind and extend functions are very Coffee specific
+            score += 3 * line.gsub(/(__bind|__extends|__hasProp|__indexOf|__slice)/).count
+          end
+        end
+        # Require a score of 3. This is fairly arbitrary. Consider
+        # tweaking later.
+        score >= 3
+      else
+        false
+      end
+    end
+    # Internal: Is this a generated documentation file for a .NET assembly?
+    #
+    # .NET developers often check in the XML Intellisense file along with an
+    # assembly - however, these don't have a special extension, so we have to
+    # dig into the contents to determine if it's a docfile. Luckily, these files
+    # are extremely structured, so recognizing them is easy.
+    #
+    # Returns true or false
+    def generated_net_docfile?
+      return false unless extname.downcase == ".xml"
+      return false unless lines.count > 3
+      # .NET Docfiles always open with <doc> and their first tag is an
+      # <assembly> tag
+      return lines[1].include?("<doc>") &&
+        lines[2].include?("<assembly>") &&
+        lines[-2].include?("</doc>")
+    end
+    # Internal: Is the blob of JS a parser generated by PEG.js?
+    #
+    # PEG.js-generated parsers are not meant to be consumed by humans.
+    #
+    # Return true or false
+    def generated_parser?
+      return false unless extname == '.js'
+      # PEG.js-generated parsers include a comment near the top  of the file
+      # that marks them as such.
+      if lines[0..4].join('') =~ /^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG.js/
+        return true
+      end
+      false
+    end
+  end
+end