gitlab-linguist 2.9.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,98 @@
1
+ require 'yaml'
2
+
3
+ require 'linguist/md5'
4
+ require 'linguist/classifier'
5
+
6
module Linguist
  # Model for accessing classifier training data.
  module Samples
    # Path to samples root directory
    ROOT = File.expand_path("../../../samples", __FILE__)

    # Path for serialized samples db
    PATH = File.expand_path('../samples.json', __FILE__)

    # Hash of serialized samples object.
    #
    # The constant is only defined when the serialized db exists at load
    # time, so referencing Samples::DATA without that file raises NameError.
    # The file is named .json but is read with the YAML loader.
    if File.exist?(PATH)
      DATA = YAML.load_file(PATH)
    end

    # Public: Iterate over each sample.
    #
    # Every directory directly under ROOT (except 'Text' and 'Binary') is
    # treated as a language category. Files inside a category are samples
    # identified by extension; files inside a category's 'filenames'
    # subdirectory are samples identified by their full filename.
    #
    # &block - Yields a Hash per sample with the keys :path and :language,
    #          plus :extname for extension samples or :filename for
    #          samples found in a 'filenames' subdirectory.
    #
    # Raises RuntimeError if a sample outside 'filenames' has no extension.
    #
    # Returns nothing when a block is given, or an Enumerator over the
    # samples when called without a block.
    def self.each(&block)
      return enum_for(:each) unless block

      children(ROOT).each do |category|
        # Skip text and binary for now
        # Possibly reconsider this later
        next if category == 'Text' || category == 'Binary'

        dirname = File.join(ROOT, category)
        children(dirname).each do |filename|
          if filename == 'filenames'
            children(File.join(dirname, filename)).each do |subfilename|
              yield({
                :path     => File.join(dirname, filename, subfilename),
                :language => category,
                :filename => subfilename
              })
            end
          else
            if File.extname(filename) == ""
              raise "#{File.join(dirname, filename)} is missing an extension, maybe it belongs in filenames/ subdir"
            end

            yield({
              :path     => File.join(dirname, filename),
              :language => category,
              :extname  => File.extname(filename)
            })
          end
        end
      end

      nil
    end

    # Internal: List the entries of a directory without the '.' and '..'
    # pseudo entries. Order is whatever Dir.entries reports.
    #
    # dir - String directory path.
    #
    # Returns Array of entry name Strings.
    def self.children(dir)
      Dir.entries(dir) - %w[. ..]
    end
    private_class_method :children

    # Public: Build Classifier from all samples.
    #
    # Walks every sample, records the extensions (unique) and filenames
    # seen per language, feeds each sample's contents to
    # Classifier.train!, and finally stores a digest of the db under 'md5'.
    #
    # Returns the trained db Hash.
    def self.data
      db = {}
      db['extnames'] = {}
      db['filenames'] = {}

      each do |sample|
        language_name = sample[:language]

        if sample[:extname]
          extnames = (db['extnames'][language_name] ||= [])
          extnames << sample[:extname] unless extnames.include?(sample[:extname])
        end

        if sample[:filename]
          (db['filenames'][language_name] ||= []) << sample[:filename]
        end

        data = File.read(sample[:path])
        Classifier.train!(db, language_name, data)
      end

      # Sort each list once, after all samples were collected, instead of
      # re-sorting on every insertion. This must happen before the digest
      # is taken so the digest covers the sorted lists.
      db['extnames'].each_value { |list| list.sort! }
      db['filenames'].each_value { |list| list.sort! }

      db['md5'] = Linguist::MD5.hexdigest(db)

      db
    end
  end
end
@@ -0,0 +1,198 @@
1
+ require 'strscan'
2
+
3
module Linguist
  # Generic programming language tokenizer.
  #
  # Tokens are designed for use in the language bayes classifier.
  # It strips any data strings or comments and preserves significant
  # language symbols.
  class Tokenizer
    # Public: Extract tokens from data
    #
    # data - String to tokenize
    #
    # Returns Array of token Strings.
    def self.tokenize(data)
      new.extract_tokens(data)
    end

    # Read up to 100KB. The limit is checked between tokens, so the token
    # being scanned when the limit is crossed is still consumed.
    BYTE_LIMIT = 100_000

    # Start state on token, ignore anything till the next newline
    SINGLE_LINE_COMMENTS = [
      '//', # C
      '#',  # Ruby
      '%',  # Tex
    ].freeze

    # Start state on opening token, ignore anything until the closing
    # token is reached.
    MULTI_LINE_COMMENTS = [
      ['/*', '*/'],    # C
      ['<!--', '-->'], # XML
      ['{-', '-}'],    # Haskell
      ['(*', '*)'],    # Coq
      ['"""', '"""']   # Python
    ].freeze

    # Matches optional leading spaces, a single line comment marker and one
    # trailing space.
    #
    # NOTE: the prefix is deliberately spelled " *" (literal spaces). It was
    # previously written as "\s*" inside a double-quoted String, where "\s"
    # is the escape for a single space character rather than the regexp
    # whitespace class, so the compiled pattern is unchanged. Tabs before a
    # comment marker are therefore not matched.
    START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
      " *#{Regexp.escape(c)} "
    }.join("|"))

    # Matches any multi line comment opening token.
    START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
      Regexp.escape(c[0])
    }.join("|"))

    # Opening token String => precompiled Regexp matching its closing token,
    # so the closing pattern is not recompiled for every comment scanned.
    END_MULTI_LINE_COMMENT = Hash[MULTI_LINE_COMMENTS.map { |open, close|
      [open, Regexp.compile(Regexp.escape(close))]
    }].freeze

    # Internal: Extract generic tokens from data.
    #
    # data - String to scan.
    #
    # Examples
    #
    #   extract_tokens("printf('Hello')")
    #   # => ['printf', '(', ')']
    #
    # Returns Array of token Strings.
    def extract_tokens(data)
      s = StringScanner.new(data)

      tokens = []
      until s.eos?
        break if s.pos >= BYTE_LIMIT

        if token = s.scan(/^#!.+$/)
          if name = extract_shebang(token)
            tokens << "SHEBANG#!#{name}"
          end

        # Single line comment
        elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
          # tokens << token.strip
          s.skip_until(/\n|\Z/)

        # Multiline comments. If the closing token never appears,
        # skip_until leaves the scan position just after the opening token.
        elsif token = s.scan(START_MULTI_LINE_COMMENT)
          # tokens << token
          s.skip_until(END_MULTI_LINE_COMMENT[token])
          # tokens << close_token

        # Skip single or double quoted strings
        elsif s.scan(/"/)
          if s.peek(1) == "\""
            s.getch
          else
            s.skip_until(/[^\\]"/)
          end
        elsif s.scan(/'/)
          if s.peek(1) == "'"
            s.getch
          else
            s.skip_until(/[^\\]'/)
          end

        # Skip number literals
        elsif s.scan(/(0x)?\d(\d|\.)*/)

        # SGML style brackets
        elsif token = s.scan(/<[^\s<>][^<>]*>/)
          extract_sgml_tokens(token).each { |t| tokens << t }

        # Common programming punctuation
        elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
          tokens << token

        # Regular token
        elsif token = s.scan(/[\w\.@#\/\*]+/)
          tokens << token

        # Common operators
        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
          tokens << token

        # Anything else is dropped one character at a time
        else
          s.getch
        end
      end

      tokens
    end

    # Internal: Extract normalized shebang command token.
    #
    # data - String shebang line.
    #
    # Examples
    #
    #   extract_shebang("#!/usr/bin/ruby")
    #   # => "ruby"
    #
    #   extract_shebang("#!/usr/bin/env node")
    #   # => "node"
    #
    #   extract_shebang("#!ruby")
    #   # => "ruby"
    #
    # Returns String token or nil if it couldn't be parsed.
    def extract_shebang(data)
      s = StringScanner.new(data)

      if path = s.scan(/^#!\s*\S+/)
        # Drop the "#!" marker before taking the last path segment, so an
        # interpreter given without a directory ("#!ruby") is not returned
        # with the marker still attached.
        script = path.sub(/\A#!\s*/, '').split('/').last
        if script == 'env'
          s.scan(/\s+/)
          script = s.scan(/\S+/)
        end
        # Keep only the first run of non-digit characters,
        # e.g. "python2.7" becomes "python".
        script = script[/[^\d]+/, 0] if script
        return script
      end

      nil
    end

    # Internal: Extract tokens from inside SGML tag.
    #
    # data - SGML tag String.
    #
    # Examples
    #
    #   extract_sgml_tokens("<a href='' class=foo>")
    #   # => ["<a>", "href=", "class="]
    #
    # Returns Array of token Strings.
    def extract_sgml_tokens(data)
      s = StringScanner.new(data)

      tokens = []

      until s.eos?
        # Emit start token
        if token = s.scan(/<\/?[^\s>]+/)
          tokens << "#{token}>"

        # Emit attributes with trailing =
        elsif token = s.scan(/\w+=/)
          tokens << token

          # Then skip over attribute value
          if s.scan(/"/)
            s.skip_until(/[^\\]"/)
          elsif s.scan(/'/)
            s.skip_until(/[^\\]'/)
          else
            s.skip_until(/\w+/)
          end

        # Emit lone attributes
        elsif token = s.scan(/\w+/)
          tokens << token

        # Stop at the end of the tag
        elsif s.scan(/>/)
          s.terminate

        else
          s.getch
        end
      end

      tokens
    end
  end
end
@@ -0,0 +1,129 @@
1
# Vendored files and directories are excluded from language
# statistics.
#
# Lines in this file are Regexps that are matched against the file
# pathname. Remember to escape literal dots (`\.`); a bare `.` matches
# any character.
#
# Please add additional test coverage to
# `test/test_blob.rb#test_vendored` if you make any changes.

## Vendor Conventions ##

# Caches
- cache/

# C deps
#  https://github.com/joyent/node
- ^deps/
- ^tools/
- (^|/)configure$
- (^|/)configure\.ac$
- (^|/)config\.guess$
- (^|/)config\.sub$

# Node dependencies
- node_modules/

# Erlang bundles
- ^rebar$

# Vendored dependencies
- vendor/

# Debian packaging
- ^debian/

## Commonly Bundled JavaScript frameworks ##

# jQuery
- (^|/)jquery([^.]*)(\.min)?\.js$
- (^|/)jquery\-\d\.\d+(\.\d+)?(\.min)?\.js$

# jQuery UI
- (^|/)jquery\-ui(\-\d\.\d+(\.\d+)?)?(\.\w+)?(\.min)?\.(js|css)$
- (^|/)jquery\.(ui|effects)\.([^.]*)(\.min)?\.(js|css)$

# Prototype
- (^|/)prototype(.*)\.js$
- (^|/)effects\.js$
- (^|/)controls\.js$
- (^|/)dragdrop\.js$

# MooTools
- (^|/)mootools([^.]*)\d+\.\d+\.\d+([^.]*)\.js$

# Dojo
- (^|/)dojo\.js$

# MochiKit
- (^|/)MochiKit\.js$

# YUI
- (^|/)yahoo-([^.]*)\.js$
- (^|/)yui([^.]*)\.js$

# WYSIWYG editors
- (^|/)ckeditor\.js$
- (^|/)tiny_mce([^.]*)\.js$
- (^|/)tiny_mce/(langs|plugins|themes|utils)

# MathJax
- (^|/)MathJax/

# SyntaxHighlighter - http://alexgorbatchev.com/
- (^|/)shBrush([^.]*)\.js$
- (^|/)shCore\.js$
- (^|/)shLegacy\.js$

## Python ##

# django
- (^|/)admin_media/

# Fabric
- ^fabfile\.py$

# WAF
- ^waf$


## Obj-C ##

# Sparkle
- (^|/)Sparkle/

## .NET ##

# Visual Studio IntelliSense
- -vsdoc\.js$

# jQuery validation plugin (MS bundles this with asp.net mvc)
- (^|/)jquery([^.]*)\.validate(\.unobtrusive)?(\.min)?\.js$
- (^|/)jquery([^.]*)\.unobtrusive\-ajax(\.min)?\.js$

# Microsoft Ajax
- (^|/)[Mm]icrosoft([Mm]vc)?([Aa]jax|[Vv]alidation)(\.debug)?\.js$

# NuGet
- ^[Pp]ackages/

# ExtJS
- (^|/)extjs/

# Samples folders
- ^[Ss]amples/

# LICENSE, README, git config files
- ^COPYING$
- ^LICENSE$
- gitattributes$
- gitignore$
- gitmodules$
- ^README$
- ^readme$

# Test fixtures
- ^[Tt]est/fixtures/

# .DS_Store files
- \.[Dd][Ss]_[Ss]tore$
metadata ADDED
@@ -0,0 +1,171 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gitlab-linguist
3
+ version: !ruby/object:Gem::Version
4
+ version: 2.9.5
5
+ platform: ruby
6
+ authors:
7
+ - GitHub
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-11-25 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: charlock_holmes
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: 0.6.6
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: 0.6.6
27
+ - !ruby/object:Gem::Dependency
28
+ name: escape_utils
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: 0.3.1
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: 0.3.1
41
+ - !ruby/object:Gem::Dependency
42
+ name: mime-types
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: '1.19'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: '1.19'
55
+ - !ruby/object:Gem::Dependency
56
+ name: gitlab-pygments.rb
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 0.5.4
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: 0.5.4
69
+ - !ruby/object:Gem::Dependency
70
+ name: mocha
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: json
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - '>='
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rake
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: yajl-ruby
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - '>='
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description:
126
+ email:
127
+ executables:
128
+ - linguist
129
+ extensions: []
130
+ extra_rdoc_files: []
131
+ files:
132
+ - lib/linguist/language.rb
133
+ - lib/linguist/vendor.yml
134
+ - lib/linguist/repository.rb
135
+ - lib/linguist/samples.rb
136
+ - lib/linguist/samples.json
137
+ - lib/linguist/generated.rb
138
+ - lib/linguist/md5.rb
139
+ - lib/linguist/popular.yml
140
+ - lib/linguist/classifier.rb
141
+ - lib/linguist/tokenizer.rb
142
+ - lib/linguist/blob_helper.rb
143
+ - lib/linguist/languages.yml
144
+ - lib/linguist/file_blob.rb
145
+ - lib/linguist.rb
146
+ - bin/linguist
147
+ homepage: https://github.com/github/linguist
148
+ licenses:
149
+ - MIT
150
+ metadata: {}
151
+ post_install_message:
152
+ rdoc_options: []
153
+ require_paths:
154
+ - lib
155
+ required_ruby_version: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - '>='
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ required_rubygems_version: !ruby/object:Gem::Requirement
161
+ requirements:
162
+ - - '>='
163
+ - !ruby/object:Gem::Version
164
+ version: '0'
165
+ requirements: []
166
+ rubyforge_project:
167
+ rubygems_version: 2.0.3
168
+ signing_key:
169
+ specification_version: 4
170
+ summary: GitHub Language detection
171
+ test_files: []