github-linguist 2.1.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -45,6 +45,10 @@ module Linguist
45
45
  })
46
46
  end
47
47
  else
48
+ if File.extname(filename) == ""
49
+ raise "#{File.join(dirname, filename)} is missing an extension, maybe it belongs in filenames/ subdir"
50
+ end
51
+
48
52
  yield({
49
53
  :path => File.join(dirname, filename),
50
54
  :language => category,
@@ -68,18 +72,16 @@ module Linguist
68
72
  each do |sample|
69
73
  language_name = sample[:language]
70
74
 
71
- # TODO: For now skip empty extnames
72
- if sample[:extname] && sample[:extname] != ""
75
+ if sample[:extname]
73
76
  db['extnames'][language_name] ||= []
74
77
  if !db['extnames'][language_name].include?(sample[:extname])
75
78
  db['extnames'][language_name] << sample[:extname]
76
79
  end
77
80
  end
78
81
 
79
- # TODO: For now skip empty extnames
80
- if fn = sample[:filename]
82
+ if sample[:filename]
81
83
  db['filenames'][language_name] ||= []
82
- db['filenames'][language_name] << fn
84
+ db['filenames'][language_name] << sample[:filename]
83
85
  end
84
86
 
85
87
  data = File.read(sample[:path])
@@ -1,3 +1,5 @@
1
+ require 'strscan'
2
+
1
3
  module Linguist
2
4
  # Generic programming language tokenizer.
3
5
  #
@@ -50,8 +52,13 @@ module Linguist
50
52
 
51
53
  tokens = []
52
54
  until s.eos?
55
+ if token = s.scan(/^#!.+$/)
56
+ if name = extract_shebang(token)
57
+ tokens << "SHEBANG#!#{name}"
58
+ end
59
+
53
60
  # Single line comment
54
- if token = s.scan(START_SINGLE_LINE_COMMENT)
61
+ elsif token = s.scan(START_SINGLE_LINE_COMMENT)
55
62
  tokens << token.strip
56
63
  s.skip_until(/\n|\Z/)
57
64
 
@@ -64,19 +71,27 @@ module Linguist
64
71
 
65
72
  # Skip single or double quoted strings
66
73
  elsif s.scan(/"/)
67
- s.skip_until(/[^\\]"/)
74
+ if s.peek(1) == "\""
75
+ s.getch
76
+ else
77
+ s.skip_until(/[^\\]"/)
78
+ end
68
79
  elsif s.scan(/'/)
69
- s.skip_until(/[^\\]'/)
80
+ if s.peek(1) == "'"
81
+ s.getch
82
+ else
83
+ s.skip_until(/[^\\]'/)
84
+ end
70
85
 
71
86
  # Skip number literals
72
- elsif s.scan(/(0x)?\d+/)
87
+ elsif s.scan(/(0x)?\d(\d|\.)*/)
73
88
 
74
89
  # SGML style brackets
75
90
  elsif token = s.scan(/<[^\s<>][^<>]*>/)
76
91
  extract_sgml_tokens(token).each { |t| tokens << t }
77
92
 
78
93
  # Common programming punctuation
79
- elsif token = s.scan(/;|\{|\}|\(|\)/)
94
+ elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
80
95
  tokens << token
81
96
 
82
97
  # Regular token
@@ -95,6 +110,33 @@ module Linguist
95
110
  tokens
96
111
  end
97
112
 
113
# Internal: Extract normalized shebang command token.
#
# data - String shebang line, e.g. "#!/usr/bin/env ruby".
#
# Examples
#
#   extract_shebang("#!/usr/bin/ruby")
#   # => "ruby"
#
#   extract_shebang("#!/usr/bin/env node")
#   # => "node"
#
#   extract_shebang("#!/usr/bin/env")
#   # => nil
#
# Returns String token or nil if it couldn't be parsed.
def extract_shebang(data)
  s = StringScanner.new(data)

  path = s.scan(/^#!\s*\S+/)
  return nil unless path

  script = path.split('/').last
  if script == 'env'
    # `env` is only a launcher; the interpreter is its first argument.
    s.scan(/\s+/)
    script = s.scan(/\S+/)
  end

  # A bare "#!/usr/bin/env" names no interpreter, so the scan above yields
  # nil. Bail out here instead of raising NoMethodError on the slice below.
  return nil unless script

  # Keep the first run of non-digit characters so versioned interpreters
  # collapse to a single token ("python2.7" => "python").
  script[/[^\d]+/, 0]
end
139
+
98
140
  # Internal: Extract tokens from inside SGML tag.
99
141
  #
100
142
  # data - SGML tag String.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: github-linguist
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.2
4
+ version: 2.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-23 00:00:00.000000000 Z
12
+ date: 2012-08-03 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: charlock_holmes
@@ -133,6 +133,7 @@ files:
133
133
  - lib/linguist/blob_helper.rb
134
134
  - lib/linguist/classifier.rb
135
135
  - lib/linguist/file_blob.rb
136
+ - lib/linguist/generated.rb
136
137
  - lib/linguist/language.rb
137
138
  - lib/linguist/languages.yml
138
139
  - lib/linguist/md5.rb