github-linguist 5.3.1 → 5.3.2
This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/ext/linguist/extconf.rb +3 -0
- data/ext/linguist/lex.linguist_yy.c +8269 -0
- data/ext/linguist/lex.linguist_yy.h +353 -0
- data/ext/linguist/linguist.c +64 -0
- data/ext/linguist/linguist.h +11 -0
- data/ext/linguist/tokenizer.l +119 -0
- data/grammars/source.coffee.json +123 -41
- data/grammars/source.crystal.json +2 -2
- data/grammars/source.css.less.json +319 -27
- data/grammars/source.glsl.json +1 -1
- data/grammars/source.js.json +6 -2
- data/grammars/source.meson.json +1 -1
- data/grammars/source.tsx.json +4 -14
- data/grammars/source.wdl.json +2 -2
- data/grammars/text.roff.json +155 -41
- data/grammars/text.shell-session.json +1 -1
- data/lib/linguist/blob_helper.rb +47 -4
- data/lib/linguist/classifier.rb +3 -1
- data/lib/linguist/file_blob.rb +3 -3
- data/lib/linguist/heuristics.rb +15 -6
- data/lib/linguist/linguist.bundle +0 -0
- data/lib/linguist/samples.json +49989 -44225
- data/lib/linguist/strategy/modeline.rb +2 -2
- data/lib/linguist/tokenizer.rb +1 -186
- data/lib/linguist/version.rb +1 -1
- metadata +25 -3
data/lib/linguist/strategy/modeline.rb
CHANGED
@@ -109,8 +109,8 @@ module Linguist
       # Returns an Array with one Language if the blob has a Vim or Emacs modeline
       # that matches a Language name or alias. Returns an empty array if no match.
       def self.call(blob, _ = nil)
-        header = blob.
-        footer = blob.
+        header = blob.first_lines(SEARCH_SCOPE).join("\n")
+        footer = blob.last_lines(SEARCH_SCOPE).join("\n")
         Array(Language.find_by_alias(modeline(header + footer)))
       end
 
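The new code asks the blob for only the first and last SEARCH_SCOPE lines rather than materializing the whole file. A minimal sketch of that call path; FakeBlob and SCOPE below are illustrative stand-ins, not names from the gem:

    # Illustrative stand-in for a Linguist blob; only the two helpers
    # the strategy calls are implemented here.
    SCOPE = 5

    class FakeBlob
      def initialize(data)
        @data = data
      end

      # Up to n lines from the top of the file.
      def first_lines(n)
        @data.lines.first(n).map(&:chomp)
      end

      # Up to n lines from the bottom of the file.
      def last_lines(n)
        @data.lines.last(n).map(&:chomp)
      end
    end

    blob   = FakeBlob.new("# vim: set ft=ruby:\nputs :hi\n")
    header = blob.first_lines(SCOPE).join("\n")
    footer = blob.last_lines(SCOPE).join("\n")
    # header + footer is the only text the modeline regexes ever see.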
data/lib/linguist/tokenizer.rb
CHANGED
@@ -1,4 +1,5 @@
 require 'strscan'
+require 'linguist/linguist'
 
 module Linguist
   # Generic programming language tokenizer.
@@ -15,191 +16,5 @@ module Linguist
     def self.tokenize(data)
       new.extract_tokens(data)
     end
-
-    # Read up to 100KB
-    BYTE_LIMIT = 100_000
-
-    # Start state on token, ignore anything till the next newline
-    SINGLE_LINE_COMMENTS = [
-      '//', # C
-      '--', # Ada, Haskell, AppleScript
-      '#',  # Ruby
-      '%',  # Tex
-      '"',  # Vim
-    ]
-
-    # Start state on opening token, ignore anything until the closing
-    # token is reached.
-    MULTI_LINE_COMMENTS = [
-      ['/*', '*/'],    # C
-      ['<!--', '-->'], # XML
-      ['{-', '-}'],    # Haskell
-      ['(*', '*)'],    # Coq
-      ['"""', '"""'],  # Python
-      ["'''", "'''"]   # Python
-    ]
-
-    START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
-      "\s*#{Regexp.escape(c)} "
-    }.join("|"))
-
-    START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
-      Regexp.escape(c[0])
-    }.join("|"))
-
-    # Internal: Extract generic tokens from data.
-    #
-    # data - String to scan.
-    #
-    # Examples
-    #
-    #   extract_tokens("printf('Hello')")
-    #   # => ['printf', '(', ')']
-    #
-    # Returns Array of token Strings.
-    def extract_tokens(data)
-      s = StringScanner.new(data)
-
-      tokens = []
-      until s.eos?
-        break if s.pos >= BYTE_LIMIT
-
-        if token = s.scan(/^#!.+$/)
-          if name = extract_shebang(token)
-            tokens << "SHEBANG#!#{name}"
-          end
-
-        # Single line comment
-        elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
-          # tokens << token.strip
-          s.skip_until(/\n|\Z/)
-
-        # Multiline comments
-        elsif token = s.scan(START_MULTI_LINE_COMMENT)
-          # tokens << token
-          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
-          s.skip_until(Regexp.compile(Regexp.escape(close_token)))
-          # tokens << close_token
-
-        # Skip single or double quoted strings
-        elsif s.scan(/"/)
-          if s.peek(1) == "\""
-            s.getch
-          else
-            s.skip_until(/(?<!\\)"/)
-          end
-        elsif s.scan(/'/)
-          if s.peek(1) == "'"
-            s.getch
-          else
-            s.skip_until(/(?<!\\)'/)
-          end
-
-        # Skip number literals
-        elsif s.scan(/(0x\h(\h|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)/)
-
-        # SGML style brackets
-        elsif token = s.scan(/<[^\s<>][^<>]*>/)
-          extract_sgml_tokens(token).each { |t| tokens << t }
-
-        # Common programming punctuation
-        elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
-          tokens << token
-
-        # Regular token
-        elsif token = s.scan(/[\w\.@#\/\*]+/)
-          tokens << token
-
-        # Common operators
-        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
-          tokens << token
-
-        else
-          s.getch
-        end
-      end
-
-      tokens
-    end
-
-    # Internal: Extract normalized shebang command token.
-    #
-    # Examples
-    #
-    #   extract_shebang("#!/usr/bin/ruby")
-    #   # => "ruby"
-    #
-    #   extract_shebang("#!/usr/bin/env node")
-    #   # => "node"
-    #
-    #   extract_shebang("#!/usr/bin/env A=B foo=bar awk -f")
-    #   # => "awk"
-    #
-    # Returns String token or nil it couldn't be parsed.
-    def extract_shebang(data)
-      s = StringScanner.new(data)
-
-      if path = s.scan(/^#!\s*\S+/)
-        script = path.split('/').last
-        if script == 'env'
-          s.scan(/\s+/)
-          s.scan(/.*=[^\s]+\s+/)
-          script = s.scan(/\S+/)
-        end
-        script = script[/[^\d]+/, 0] if script
-        return script
-      end
-
-      nil
-    end
-
-    # Internal: Extract tokens from inside SGML tag.
-    #
-    # data - SGML tag String.
-    #
-    # Examples
-    #
-    #   extract_sgml_tokens("<a href='' class=foo>")
-    #   # => ["<a>", "href="]
-    #
-    # Returns Array of token Strings.
-    def extract_sgml_tokens(data)
-      s = StringScanner.new(data)
-
-      tokens = []
-
-      until s.eos?
-        # Emit start token
-        if token = s.scan(/<\/?[^\s>]+/)
-          tokens << "#{token}>"
-
-        # Emit attributes with trailing =
-        elsif token = s.scan(/\w+=/)
-          tokens << token
-
-          # Then skip over attribute value
-          if s.scan(/"/)
-            s.skip_until(/[^\\]"/)
-          elsif s.scan(/'/)
-            s.skip_until(/[^\\]'/)
-          else
-            s.skip_until(/\w+/)
-          end
-
-        # Emit lone attributes
-        elsif token = s.scan(/\w+/)
-          tokens << token
-
-        # Stop at the end of the tag
-        elsif s.scan(/>/)
-          s.terminate
-
-        else
-          s.getch
-        end
-      end
-
-      tokens
-    end
   end
 end
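The entire StringScanner implementation is removed in favor of a flex-generated C extension (tokenizer.l compiled to lex.linguist_yy.c, loaded by the new require 'linguist/linguist' line); the Ruby entry point is unchanged. A usage sketch, with the expected result taken from the removed docstring:

    require 'linguist'

    # Same public API as before 5.3.2; the scanning itself now runs in
    # the native extension instead of Ruby's StringScanner.
    p Linguist::Tokenizer.tokenize("printf('Hello')")
    # => ["printf", "(", ")"]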
data/lib/linguist/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: github-linguist
 version: !ruby/object:Gem::Version
-  version: 5.3.1
+  version: 5.3.2
 platform: ruby
 authors:
 - GitHub
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-10-
+date: 2017-10-31 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: charlock_holmes
@@ -80,6 +80,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '5.0'
+- !ruby/object:Gem::Dependency
+  name: rake-compiler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.9'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.9'
 - !ruby/object:Gem::Dependency
   name: mocha
   requirement: !ruby/object:Gem::Requirement
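rake-compiler arrives as a development dependency to build the new extension during development and testing. A sketch of the usual Rake::ExtensionTask wiring; the gem's actual Rakefile is not part of this diff, and lib_dir is inferred from the lib/linguist/linguist.bundle entry further down:

    require 'rake/extensiontask'

    # Typical rake-compiler setup: `rake compile` builds ext/linguist
    # into a shared object under lib/linguist/.
    Rake::ExtensionTask.new('linguist') do |ext|
      ext.ext_dir = 'ext/linguist'
      ext.lib_dir = 'lib/linguist'
    end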
@@ -199,12 +213,19 @@ email:
 executables:
 - linguist
 - git-linguist
-extensions: []
+extensions:
+- ext/linguist/extconf.rb
 extra_rdoc_files: []
 files:
 - LICENSE
 - bin/git-linguist
 - bin/linguist
+- ext/linguist/extconf.rb
+- ext/linguist/lex.linguist_yy.c
+- ext/linguist/lex.linguist_yy.h
+- ext/linguist/linguist.c
+- ext/linguist/linguist.h
+- ext/linguist/tokenizer.l
 - grammars/annotation.liquidhaskell.haskell.json
 - grammars/config.xcompose.json
 - grammars/file.lasso.json
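Because the gemspec now declares extensions: ext/linguist/extconf.rb, gem install runs that script on the target machine to generate a Makefile and compile the extension. A minimal extconf.rb of the usual mkmf shape, as a sketch; the gem's real file may differ:

    require 'mkmf'

    # Names the require path of the built library, matching the
    # `require 'linguist/linguist'` added to tokenizer.rb.
    create_makefile('linguist/linguist')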
@@ -651,6 +672,7 @@ files:
 - lib/linguist/languages.json
 - lib/linguist/languages.yml
 - lib/linguist/lazy_blob.rb
+- lib/linguist/linguist.bundle
 - lib/linguist/md5.rb
 - lib/linguist/popular.yml
 - lib/linguist/repository.rb