RubyGems - github-linguist - Versions diffs - 2.10.0 → 2.10.5 - Mend

github-linguist 2.10.0 → 2.10.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

checksums.yaml +4 -4
data/lib/linguist/blob_helper.rb +2 -2
data/lib/linguist/classifier.rb +15 -15
data/lib/linguist/generated.rb +9 -0
data/lib/linguist/language.rb +60 -4
data/lib/linguist/languages.yml +114 -6
data/lib/linguist/samples.json +1515 -104
data/lib/linguist/samples.rb +46 -0
metadata +2 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: d32dfca063a58b4fdda048d20fbf9ee735a87a47
-  data.tar.gz: 8603ce6993bb645e7d847f991142436d999c5e96
+  metadata.gz: b0ecbabc138b17a77febdabc0c64f378f67bd13c
+  data.tar.gz: ae91bedb24a96d1dfcaf4680dcc5530f475deee0
 SHA512:
-  metadata.gz: db134aedc386ccc175b578474a84c75391814b235e01da962b6d2483453cfc80dec48670c5a23c65930fb723c92a0ca48df320c24eeead87fedbe5c6a21a448f
-  data.tar.gz: 89195a956fe06ab069fff7c4bf61c2324cb397964d818fcd7926eb8548b56e58ad73bf566b9cedd88275d10def55e3909e715bd686754b45b83e0aaacbb717b3
+  metadata.gz: ea8ea0349099feaee685a38a6800bef2f4d695b9c422394f5d34c3432da19621912a2f296bd48cc43d989758ff8d45613a4185e86195907b6cf133fd24fa74c3
+  data.tar.gz: cb0dd718502e691e8e09baf8753b17e6dc042f408b5d297f139167c62ae8ff5c9348c9f2498435b7a233ecd7e792ef346920f68f4e11d4b8c2c7738b1637637f

data/lib/linguist/blob_helper.rb CHANGED

@@ -190,9 +190,9 @@ module Linguist
     # Public: Is the blob safe to colorize?
     #
     # We use Pygments for syntax highlighting blobs. Pygments
-    # can be too slow for very large blobs or for certain
+    # can be too slow for very large blobs or for certain
     # corner-case blobs.
-    #
+    #
     # Return true or false
     def safe_to_colorize?
       !large? && text? && !high_ratio_of_long_lines?

data/lib/linguist/classifier.rb CHANGED

@@ -78,18 +78,13 @@ module Linguist
     def classify(tokens, languages)
       return [] if tokens.nil?
       tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
       scores = {}
-      if verbosity >= 2
-        dump_all_tokens(tokens, languages)
-      end
+      debug_dump_all_tokens(tokens, languages) if verbosity >= 2
       languages.each do |language|
-        scores[language] = tokens_probability(tokens, language) +
-                                   language_probability(language)
-        if verbosity >= 1
-          printf "%10s = %10.3f + %7.3f = %10.3f\n",
-            language, tokens_probability(tokens, language), language_probability(language), scores[language]
-        end
+        debug_dump_probabilities(tokens, language) if verbosity >= 1
+        scores[language] = tokens_probability(tokens, language) + language_probability(language)
       end
       scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [score[0], score[1]] }
@@ -135,6 +130,11 @@ module Linguist
         @verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
       end
+      def debug_dump_probabilities
+        printf("%10s = %10.3f + %7.3f = %10.3f\n",
+            language, tokens_probability(tokens, language), language_probability(language), scores[language])
+      end
       # Internal: show a table of probabilities for each <token,language> pair.
       #
       # The number in each table entry is the number of "points" that each
@@ -145,22 +145,22 @@ module Linguist
       # how much more likely (log of probability ratio) that token is to
       # appear in one language vs. the least-likely language.  Dashes
       # indicate the least-likely language (and zero points) for each token.
-      def dump_all_tokens(tokens, languages)
+      def debug_dump_all_tokens(tokens, languages)
         maxlen = tokens.map { |tok| tok.size }.max
         printf "%#{maxlen}s", ""
         puts "    #" + languages.map { |lang| sprintf("%10s", lang) }.join
         token_map = Hash.new(0)
         tokens.each { |tok| token_map[tok] += 1 }
         token_map.sort.each { |tok, count|
           arr = languages.map { |lang| [lang, token_probability(tok, lang)] }
           min = arr.map { |a,b| b }.min
           minlog = Math.log(min)
           if !arr.inject(true) { |result, n| result && n[1] == arr[0][1] }
             printf "%#{maxlen}s%5d", tok, count
             puts arr.map { |ent|
               ent[1] == min ? "         -" : sprintf("%10.3f", count * (Math.log(ent[1]) - minlog))
             }.join

data/lib/linguist/generated.rb CHANGED

@@ -60,6 +60,7 @@ module Linguist
         generated_net_designer_file? ||
         generated_protocol_buffer? ||
         generated_jni_header? ||
+        composer_lock? ||
         node_modules?
     end
@@ -204,5 +205,13 @@ module Linguist
     def node_modules?
       !!name.match(/node_modules\//)
     end
+    # The PHP Composer tool generates a lock file to represent a specific dependency state.
+    # In general, it is not meant to be reviewed by humans in pull requests.
+    #
+    # Returns true or false.
+    def composer_lock?
+      !!name.match(/composer.lock/)
+    end
   end
 end

data/lib/linguist/language.rb CHANGED

@@ -21,17 +21,27 @@ module Linguist
     @alias_index     = {}
     @extension_index          = Hash.new { |h,k| h[k] = [] }
+    @interpreter_index        = Hash.new { |h,k| h[k] = [] }
     @filename_index           = Hash.new { |h,k| h[k] = [] }
     @primary_extension_index  = {}
     # Valid Languages types
-    TYPES = [:data, :markup, :programming]
+    TYPES = [:data, :markup, :programming, :prose]
     # Names of non-programming languages that we will still detect
     #
     # Returns an array
     def self.detectable_markup
-      ["CSS", "Less", "Sass", "TeX"]
+      ["CSS", "Less", "Sass", "Stylus", "TeX"]
+    end
+    # Public: Look up all Languages of a specific type.
+    #
+    # type - A symbol that exists within TYPES
+    #
+    # Returns an array
+    def self.by_type(type)
+      all.select { |h| h.type == type }
     end
     # Internal: Create a new Language object
@@ -75,6 +85,10 @@ module Linguist
       @primary_extension_index[language.primary_extension] = language
+      language.interpreters.each do |interpreter|
+        @interpreter_index[interpreter] << language
+      end
       language.filenames.each do |filename|
         @filename_index[filename] << language
       end
@@ -103,10 +117,13 @@ module Linguist
       if possible_languages.length > 1
         data = data.call() if data.respond_to?(:call)
         if data.nil? || data == ""
           nil
-        elsif result = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first
-          Language[result[0]]
+        elsif (result = find_by_shebang(data)) && !result.empty?
+          result.first
+        elsif classified = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first
+          Language[classified[0]]
         end
       else
         possible_languages.first
@@ -166,6 +183,20 @@ module Linguist
       langs.compact.uniq
     end
+    # Public: Look up Languages by shebang line.
+    #
+    # data - Array of tokens or String data to analyze.
+    #
+    # Examples
+    #
+    #   Language.find_by_shebang("#!/bin/bash\ndate;")
+    #   # => [#<Language name="Bash">]
+    #
+    # Returns an Array of matching Languages (empty if none match).
+    def self.find_by_shebang(data)
+      @interpreter_index[Linguist.interpreter_from_shebang(data)]
+    end
     # Public: Look up Language by its name or lexer.
     #
     # name - The String name of the Language
@@ -251,6 +282,7 @@ module Linguist
       # Set extensions or default to [].
       @extensions = attributes[:extensions] || []
+      @interpreters = attributes[:interpreters]   || []
       @filenames  = attributes[:filenames]  || []
       unless @primary_extension = attributes[:primary_extension]
@@ -363,6 +395,15 @@ module Linguist
     # Returns the extension String.
     attr_reader :primary_extension
+    # Public: Get interpreters
+    #
+    # Examples
+    #
+    #   # => ['awk', 'gawk', 'mawk' ...]
+    #
+    # Returns the interpreters Array
+    attr_reader :interpreters
     # Public: Get filenames
     #
     # Examples
@@ -456,6 +497,7 @@ module Linguist
   end
   extensions = Samples::DATA['extnames']
+  interpreters = Samples::DATA['interpreters']
   filenames = Samples::DATA['filenames']
   popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__))
@@ -470,6 +512,7 @@ module Linguist
   languages.each do |name, options|
     options['extensions'] ||= []
+    options['interpreters'] ||= []
     options['filenames'] ||= []
     if extnames = extensions[name]
@@ -480,6 +523,18 @@ module Linguist
       end
     end
+    if interpreters == nil
+      interpreters = {}
+    end
+    if interpreter_names = interpreters[name]
+      interpreter_names.each do |interpreter|
+        if !options['interpreters'].include?(interpreter)
+          options['interpreters'] << interpreter
+        end
+      end
+    end
     if fns = filenames[name]
       fns.each do |filename|
         if !options['filenames'].include?(filename)
@@ -500,6 +555,7 @@ module Linguist
       :searchable        => options.key?('searchable') ? options['searchable'] : true,
       :search_term       => options['search_term'],
       :extensions        => options['extensions'].sort,
+      :interpreters      => options['interpreters'].sort,
       :primary_extension => options['primary_extension'],
       :filenames         => options['filenames'],
       :popular           => popular.include?(name)

data/lib/linguist/languages.yml CHANGED

@@ -10,6 +10,7 @@
 # ace_mode          - A String name of Ace Mode (if available)
 # wrap              - Boolean wrap to enable line wrapping (default: false)
 # extensions        - An Array of associated extensions
+# interpreters      - An Array of associated interpreters
 # primary_extension - A String for the main extension associated with
 #                     the language. Must be unique. Used when a Language is picked
 #                     from a dropdown and we need to automatically choose an
@@ -22,7 +23,7 @@
 # Any additions or modifications (even trivial) should have corresponding
 # test change in `test/test_blob.rb`.
 #
-# Please keep this list alphabetized.
+# Please keep this list alphabetized. Uppercase letters sort before lowercase ones.
 ABAP:
   type: programming
@@ -70,6 +71,7 @@ Ada:
 Agda:
   type: programming
+  color: "#467C91"
   primary_extension: .agda
 ApacheConf:
@@ -101,6 +103,16 @@ Arduino:
   lexer: C++
   primary_extension: .ino
+AsciiDoc:
+  type: prose
+  lexer: Text only
+  ace_mode: asciidoc
+  wrap: true
+  primary_extension: .asciidoc
+  extensions:
+  - .adoc
+  - .asc
 Assembly:
   type: programming
   lexer: NASM
@@ -140,6 +152,11 @@ Awk:
   - .gawk
   - .mawk
   - .nawk
+  interpreters:
+  - awk
+  - gawk
+  - mawk
+  - nawk
 Batchfile:
   type: programming
@@ -181,6 +198,11 @@ Brainfuck:
   extensions:
   - .bf
+Brightscript:
+  type: programming
+  lexer: Text only
+  primary_extension: .brs
 Bro:
   type: programming
   primary_extension: .bro
@@ -333,6 +355,12 @@ Common Lisp:
   - .lsp
   - .ny
   - .podsl
+  interpreters:
+  - lisp
+  - sbcl
+  - ccl
+  - clisp
+  - ecl
 Coq:
   type: programming
@@ -346,6 +374,12 @@ Cpp-ObjDump:
   - .c++objdump
   - .cxx-objdump
+Creole:
+  type: prose
+  lexer: Text only
+  wrap: true
+  primary_extension: .creole
 Cucumber:
   lexer: Gherkin
   primary_extension: .feature
@@ -379,7 +413,7 @@ D-ObjDump:
 DM:
   type: programming
   color: "#075ff1"
-  lexer: Text only
+  lexer: C++
   primary_extension: .dm
   aliases:
   - byond
@@ -748,6 +782,7 @@ JSON:
   - .sublime-settings
   - .sublime-workspace
   filenames:
+  - .jshintrc
   - composer.lock
 Jade:
@@ -918,6 +953,8 @@ Makefile:
   - makefile
   - Makefile
   - GNUmakefile
+  interpreters:
+  - make
 Mako:
   primary_extension: .mako
@@ -925,7 +962,7 @@ Mako:
   - .mao
 Markdown:
-  type: markup
+  type: prose
   lexer: Text only
   ace_mode: markdown
   wrap: true
@@ -956,6 +993,12 @@ Max:
   - .mxt
   - .pat
+MediaWiki:
+  type: prose
+  lexer: Text only
+  wrap: true
+  primary_extension: .mediawiki
 MiniD: # Legacy
   searchable: false
   primary_extension: .minid # Dummy extension
@@ -1091,6 +1134,12 @@ OpenEdge ABL:
   - abl
   primary_extension: .p
+Org:
+  type: prose
+  lexer: Text only
+  wrap: true
+  primary_extension: .org
 Oxygene:
   type: programming
   lexer: Text only
@@ -1157,6 +1206,8 @@ Perl:
   - .pm6
   - .pod
   - .psgi
+  interpreters:
+  - perl
 Pike:
   type: programming
@@ -1166,6 +1217,13 @@ Pike:
   extensions:
   - .pmod
+Pod:
+  type: prose
+  lexer: Text only
+  ace_mode: perl
+  wrap: true
+  primary_extension: .pod
 PogoScript:
   type: programming
   color: "#d80074"
@@ -1224,12 +1282,15 @@ Python:
   primary_extension: .py
   extensions:
   - .gyp
+  - .lmi
   - .pyt
   - .pyw
   - .wsgi
   - .xpy
   filenames:
   - wscript
+  interpreters:
+  - python
 Python traceback:
   type: data
@@ -1252,6 +1313,15 @@ R:
   - .R
   filenames:
   - .Rprofile
+  interpreters:
+  - Rscript
+RDoc:
+  type: prose
+  lexer: Text only
+  ace_mode: rdoc
+  wrap: true
+  primary_extension: .rdoc
 REALbasic:
   type: programming
@@ -1269,6 +1339,15 @@ RHTML:
   group: HTML
   primary_extension: .rhtml
+RMarkdown:
+  type: markup
+  lexer: Text only
+  wrap: true
+  ace_mode: markdown
+  primary_extension: .rmd
+  extensions:
+  - .Rmd
 Racket:
   type: programming
   lexer: Racket
@@ -1339,6 +1418,8 @@ Ruby:
   - .ru
   - .thor
   - .watchr
+  interpreters:
+  - ruby
   filenames:
   - Appraisals
   - Berksfile
@@ -1381,6 +1462,8 @@ Scala:
   ace_mode: scala
   color: "#7dd3b0"
   primary_extension: .scala
+  extensions:
+  - .sc
 Scaml:
   group: HTML
@@ -1394,6 +1477,11 @@ Scheme:
   extensions:
   - .sls
   - .ss
+  interpreters:
+  - guile
+  - racket
+  - bigloo
+  - chicken
 Scilab:
   type: programming
@@ -1418,6 +1506,10 @@ Shell:
   extensions:
   - .bats
   - .tmux
+  interpreters:
+  - bash
+  - sh
+  - zsh
   filenames:
   - Dockerfile
@@ -1446,11 +1538,17 @@ Standard ML:
   - sml
   primary_extension: .sml
+Stylus:
+  type: markup
+  group: CSS
+  lexer: Text only
+  primary_extension: .styl
 SuperCollider:
   type: programming
   color: "#46390b"
   lexer: Text only
-  primary_extension: .sc
+  primary_extension: .scd
 TOML:
   type: data
@@ -1477,7 +1575,9 @@ Tcsh:
 TeX:
   type: markup
+  color: "#3D6117"
   ace_mode: latex
+  wrap: true
   aliases:
   - latex
   primary_extension: .tex
@@ -1498,7 +1598,7 @@ Tea:
   primary_extension: .tea
 Textile:
-  type: markup
+  type: prose
   lexer: Text only
   ace_mode: textile
   wrap: true
@@ -1544,6 +1644,14 @@ VHDL:
   lexer: vhdl
   color: "#543978"
   primary_extension: .vhdl
+  extensions:
+  - .vhd
+  - .vhf
+  - .vhi
+  - .vho
+  - .vhs
+  - .vht
+  - .vhw
 Vala:
   type: programming
@@ -1733,7 +1841,7 @@ ooc:
   primary_extension: .ooc
 reStructuredText:
-  type: markup
+  type: prose
   wrap: true
   search_term: rst
   aliases: