ol-github-linguist 2.4.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/linguist +42 -0
- data/lib/linguist.rb +5 -0
- data/lib/linguist/blob_helper.rb +352 -0
- data/lib/linguist/classifier.rb +123 -0
- data/lib/linguist/file_blob.rb +56 -0
- data/lib/linguist/generated.rb +162 -0
- data/lib/linguist/language.rb +483 -0
- data/lib/linguist/languages.yml +1302 -0
- data/lib/linguist/md5.rb +38 -0
- data/lib/linguist/popular.yml +29 -0
- data/lib/linguist/repository.rb +95 -0
- data/lib/linguist/samples.json +31082 -0
- data/lib/linguist/samples.rb +98 -0
- data/lib/linguist/tokenizer.rb +197 -0
- data/lib/linguist/vendor.yml +98 -0
- metadata +129 -0
require 'yaml'

require 'linguist/md5'
require 'linguist/classifier'

module Linguist
  # Model for accessing classifier training data.
  module Samples
    # Path to samples root directory
    ROOT = File.expand_path("../../../samples", __FILE__)

    # Path for serialized samples db
    PATH = File.expand_path('../samples.json', __FILE__)

    # Hash of serialized samples object.
    # NOTE(review): the db is serialized as JSON but loaded with YAML here;
    # this relies on YAML being a superset of JSON — confirm before changing.
    if File.exist?(PATH)
      DATA = YAML.load_file(PATH)
    end

    # Public: Iterate over each sample.
    #
    # &block - Yields Sample to block
    #
    # Returns nothing.
    def self.each(&block)
      Dir.entries(ROOT).each do |category|
        next if category == '.' || category == '..'

        # Skip text and binary for now
        # Possibly reconsider this later
        next if category == 'Text' || category == 'Binary'

        dirname = File.join(ROOT, category)
        Dir.entries(dirname).each do |filename|
          next if filename == '.' || filename == '..'

          if filename == 'filenames'
            # Extensionless samples live under a `filenames/` subdir and are
            # keyed by exact filename rather than extension.
            Dir.entries(File.join(dirname, filename)).each do |subfilename|
              next if subfilename == '.' || subfilename == '..'

              yield({
                :path     => File.join(dirname, filename, subfilename),
                :language => category,
                :filename => subfilename
              })
            end
          else
            if File.extname(filename) == ""
              raise "#{File.join(dirname, filename)} is missing an extension, maybe it belongs in filenames/ subdir"
            end

            yield({
              :path     => File.join(dirname, filename),
              :language => category,
              :extname  => File.extname(filename)
            })
          end
        end
      end

      nil
    end

    # Public: Build Classifier from all samples.
    #
    # Returns trained Classifier.
    def self.data
      db = {}
      db['extnames'] = {}
      db['filenames'] = {}

      each do |sample|
        language_name = sample[:language]

        # Record each unique extension seen for the language, kept sorted
        # so the serialized db is deterministic.
        if sample[:extname]
          db['extnames'][language_name] ||= []
          if !db['extnames'][language_name].include?(sample[:extname])
            db['extnames'][language_name] << sample[:extname]
            db['extnames'][language_name].sort!
          end
        end

        if sample[:filename]
          db['filenames'][language_name] ||= []
          db['filenames'][language_name] << sample[:filename]
          db['filenames'][language_name].sort!
        end

        data = File.read(sample[:path])
        Classifier.train!(db, language_name, data)
      end

      # Fingerprint the trained db so consumers can detect staleness.
      db['md5'] = Linguist::MD5.hexdigest(db)

      db
    end
  end
end
require 'strscan'

module Linguist
  # Generic programming language tokenizer.
  #
  # Tokens are designed for use in the language bayes classifier.
  # It strips any data strings or comments and preserves significant
  # language symbols.
  class Tokenizer
    # Public: Extract tokens from data
    #
    # data - String to tokenize
    #
    # Returns Array of token Strings.
    def self.tokenize(data)
      new.extract_tokens(data)
    end

    # Read up to 100KB
    BYTE_LIMIT = 100_000

    # Start state on token, ignore anything till the next newline
    SINGLE_LINE_COMMENTS = [
      '//', # C
      '#',  # Ruby
      '%',  # Tex
    ]

    # Start state on opening token, ignore anything until the closing
    # token is reached.
    MULTI_LINE_COMMENTS = [
      ['/*', '*/'],    # C
      ['<!--', '-->'], # XML
      ['{-', '-}'],    # Haskell
      ['(*', '*)']     # Coq
    ]

    START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
      "\s*#{Regexp.escape(c)} "
    }.join("|"))

    START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
      Regexp.escape(c[0])
    }.join("|"))

    # Internal: Extract generic tokens from data.
    #
    # data - String to scan.
    #
    # Examples
    #
    #   extract_tokens("printf('Hello')")
    #   # => ['printf', '(', ')']
    #
    # Returns Array of token Strings.
    def extract_tokens(data)
      s = StringScanner.new(data)

      tokens = []
      until s.eos?
        # Cap how much of the input is tokenized.
        break if s.pos >= BYTE_LIMIT

        if token = s.scan(/^#!.+$/)
          if name = extract_shebang(token)
            tokens << "SHEBANG#!#{name}"
          end

        # Single line comment
        elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
          # tokens << token.strip
          s.skip_until(/\n|\Z/)

        # Multiline comments
        elsif token = s.scan(START_MULTI_LINE_COMMENT)
          # tokens << token
          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
          s.skip_until(Regexp.compile(Regexp.escape(close_token)))
          # tokens << close_token

        # Skip single or double quoted strings
        elsif s.scan(/"/)
          if s.peek(1) == "\""
            s.getch
          else
            s.skip_until(/[^\\]"/)
          end
        elsif s.scan(/'/)
          if s.peek(1) == "'"
            s.getch
          else
            s.skip_until(/[^\\]'/)
          end

        # Skip number literals
        elsif s.scan(/(0x)?\d(\d|\.)*/)

        # SGML style brackets
        elsif token = s.scan(/<[^\s<>][^<>]*>/)
          extract_sgml_tokens(token).each { |t| tokens << t }

        # Common programming punctuation
        elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
          tokens << token

        # Regular token
        elsif token = s.scan(/[\w\.@#\/\*]+/)
          tokens << token

        # Common operators
        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
          tokens << token

        else
          s.getch
        end
      end

      tokens
    end

    # Internal: Extract normalized shebang command token.
    #
    # Examples
    #
    #   extract_shebang("#!/usr/bin/ruby")
    #   # => "ruby"
    #
    #   extract_shebang("#!/usr/bin/env node")
    #   # => "node"
    #
    # Returns String token or nil it couldn't be parsed.
    def extract_shebang(data)
      s = StringScanner.new(data)

      if path = s.scan(/^#!\s*\S+/)
        script = path.split('/').last
        if script == 'env'
          # `env` is an indirection; the real interpreter follows it.
          s.scan(/\s+/)
          script = s.scan(/\S+/)
        end
        # Strip trailing version digits: "python2.7" => "python"
        script = script[/[^\d]+/, 0] if script
        return script
      end

      nil
    end

    # Internal: Extract tokens from inside SGML tag.
    #
    # data - SGML tag String.
    #
    # Examples
    #
    #   extract_sgml_tokens("<a href='' class=foo>")
    #   # => ["<a>", "href="]
    #
    # Returns Array of token Strings.
    def extract_sgml_tokens(data)
      s = StringScanner.new(data)

      tokens = []

      until s.eos?
        # Emit start token
        if token = s.scan(/<\/?[^\s>]+/)
          tokens << "#{token}>"

        # Emit attributes with trailing =
        elsif token = s.scan(/\w+=/)
          tokens << token

          # Then skip over attribute value
          if s.scan(/"/)
            s.skip_until(/[^\\]"/)
          elsif s.scan(/'/)
            s.skip_until(/[^\\]'/)
          else
            s.skip_until(/\w+/)
          end

        # Emit lone attributes
        elsif token = s.scan(/\w+/)
          tokens << token

        # Stop at the end of the tag
        elsif s.scan(/>/)
          s.terminate

        else
          s.getch
        end
      end

      tokens
    end
  end
end
# Vendored files and directories are excluded from language
# statistics.
#
# Lines in this file are Regexps that are matched against the file
# pathname.
#
# Please add additional test coverage to
# `test/test_blob.rb#test_vendored` if you make any changes.

## Vendor Conventions ##

# Caches
- cache/

# C deps
# https://github.com/joyent/node
- ^deps/
- ^tools/

# Node dependencies
- node_modules/

# Vendored dependencies
- vendor/

# Debian packaging
- ^debian/

## Commonly Bundled JavaScript frameworks ##

# jQuery
- (^|/)jquery([^.]*)(\.min)?\.js$
- (^|/)jquery\-\d\.\d(\.\d)?(\.min)?\.js$

# Prototype
- (^|/)prototype(.*)\.js$
- (^|/)effects\.js$
- (^|/)controls\.js$
- (^|/)dragdrop\.js$

# MooTools
- (^|/)mootools([^.]*)\d+\.\d+.\d+([^.]*)\.js$

# Dojo
- (^|/)dojo\.js$

# MochiKit
- (^|/)MochiKit\.js$

# YUI
- (^|/)yahoo-([^.]*)\.js$
- (^|/)yui([^.]*)\.js$

# LESS css
- (^|/)less([^.]*)(\.min)?\.js$
- (^|/)less\-\d+\.\d+\.\d+(\.min)?\.js$

# WYS editors
- (^|/)ckeditor\.js$
- (^|/)tiny_mce([^.]*)\.js$
- (^|/)tiny_mce/(langs|plugins|themes|utils)

# MathJax
- (^|/)MathJax/

## Python ##

# Fabric
- ^fabfile\.py$

# WAF
- ^waf$


## Obj-C ##

# Sparkle
- (^|/)Sparkle/

## .NET ##

# Visual Studio IntelliSense
- -vsdoc\.js$

# jQuery validation plugin (MS bundles this with asp.net mvc)
- (^|/)jquery([^.]*)\.validate(\.min)?\.js$

# Microsoft Ajax
- (^|/)[Mm]icrosoft([Mm]vc)?([Aa]jax|[Vv]alidation)(\.debug)?\.js$

# NuGet
- ^[Pp]ackages/

# ExtJS
- (^|/)extjs/

# Samples folders
- ^[Ss]amples/
--- !ruby/object:Gem::Specification
name: ol-github-linguist
version: !ruby/object:Gem::Version
  version: 2.4.2.5
platform: ruby
authors:
- GitHub/OpenLogic
autorequire:
bindir: bin
cert_chain: []
date: 2015-01-30 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: mime-types
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.19'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.19'
- !ruby/object:Gem::Dependency
  name: mocha
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: json
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: yajl-ruby
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
description:
email:
executables:
- linguist
extensions: []
extra_rdoc_files: []
files:
- bin/linguist
- lib/linguist.rb
- lib/linguist/blob_helper.rb
- lib/linguist/classifier.rb
- lib/linguist/file_blob.rb
- lib/linguist/generated.rb
- lib/linguist/language.rb
- lib/linguist/languages.yml
- lib/linguist/md5.rb
- lib/linguist/popular.yml
- lib/linguist/repository.rb
- lib/linguist/samples.json
- lib/linguist/samples.rb
- lib/linguist/tokenizer.rb
- lib/linguist/vendor.yml
homepage: https://github.com/openlogic/ol-github-linguist
licenses: []
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.2.2
signing_key:
specification_version: 4
summary: GitHub Language detection (special OL dub)
test_files: []
has_rdoc: