github-linguist 1.0.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/linguist/blob_helper.rb +10 -2
- data/lib/linguist/classifier.rb +183 -0
- data/lib/linguist/classifier.yml +19013 -0
- data/lib/linguist/language.rb +1 -1
- data/lib/linguist/sample.rb +74 -0
- data/lib/linguist/tokenizer.rb +157 -0
- metadata +22 -4
data/lib/linguist/sample.rb
CHANGED
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'linguist/classifier'
|
2
|
+
require 'linguist/language'
|
3
|
+
|
4
|
+
module Linguist
  # Model for accessing classifier training data.
  class Sample
    # Samples live in test/ for now, we'll eventually move them out
    PATH = File.expand_path("../../../test/fixtures", __FILE__)

    # Public: Iterate over each Sample.
    #
    # &block - Yields Sample to block
    #
    # Returns nothing.
    def self.each(&block)
      Dir.entries(PATH).each do |category|
        # "." and ".." are directory plumbing, not sample categories.
        next if %w[. ..].include?(category)

        # Skip text and binary for now
        # Possibly reconsider this later
        next if %w[text binary].include?(category)

        # Map directory name to a Language alias
        language = Linguist::Language.find_by_alias(category)
        raise "No language for #{category.inspect}" unless language

        category_dir = File.join(PATH, category)
        Dir.entries(category_dir).each do |entry|
          next if %w[. ..].include?(entry)
          yield new(File.join(category_dir, entry), language)
        end
      end

      nil
    end

    # Public: Build Classifier from all samples.
    #
    # Returns trained Classifier.
    def self.classifier
      trained = Classifier.new
      each do |sample|
        trained.train(sample.language, sample.data)
      end
      # gc prunes the training data and returns the classifier itself.
      trained.gc
    end

    # Internal: Initialize Sample.
    #
    # Samples should be initialized by Sample.each.
    #
    # path     - String full path to file.
    # language - Language of sample.
    def initialize(path, language)
      @path, @language = path, language
    end

    # Public: Get full path to file.
    #
    # Returns String.
    attr_reader :path

    # Public: Get sample language.
    #
    # Returns Language.
    attr_reader :language

    # Public: Read file contents.
    #
    # Returns String.
    def data
      File.read(path)
    end
  end
end
|
data/lib/linguist/tokenizer.rb
CHANGED
@@ -0,0 +1,157 @@
|
|
1
|
+
require 'strscan'

module Linguist
  # Generic programming language tokenizer.
  #
  # Tokens are designed for use in the language bayes classifier.
  # It strips any data strings or comments and preserves significant
  # language symbols.
  class Tokenizer
    # Public: Initialize a Tokenizer.
    #
    # data - String data to scan.
    def initialize(data)
      @data = data
    end

    # Public: Get source data.
    #
    # Returns String.
    attr_reader :data

    # Public: Extract tokens from data.
    #
    # Returns Array of token Strings.
    def tokens
      extract_tokens(data)
    end

    # Internal: Extract generic tokens from data.
    #
    # Comments and string/number literal contents are dropped;
    # comment delimiters, punctuation, identifiers, and operators
    # are kept. Branch order matters: e.g. comments must be tried
    # before the operator branch so "//" is not split.
    #
    # data - String to scan.
    #
    # Examples
    #
    #   extract_tokens("printf('Hello')")
    #   # => ['printf', '(', ')']
    #
    # Returns Array of token Strings.
    def extract_tokens(data)
      s = StringScanner.new(data)

      tokens = []
      until s.eos?
        # Ruby single line comment
        if s.scan(/# /)
          tokens << "#"
          s.skip_until(/\n|\Z/)

        # C style single line comment
        elsif s.scan(/\/\/ /)
          tokens << "//"
          s.skip_until(/\n|\Z/)

        # Leading Tex or Matlab comments
        elsif s.scan(/\n%/)
          tokens << "%"
          s.skip_until(/\n|\Z/)

        # C multiline comments
        elsif s.scan(/\/\*/)
          tokens << "/*"
          s.skip_until(/\*\//)
          tokens << "*/"

        # Haskell multiline comments
        elsif s.scan(/\{-/)
          tokens << "{-"
          s.skip_until(/-\}/)
          tokens << "-}"

        # XML multiline comments
        elsif s.scan(/<!--/)
          tokens << "<!--"
          s.skip_until(/-->/)
          tokens << "-->"

        # Skip single or double quoted strings
        # NOTE(review): an empty literal ("" or '') leaves its closing
        # quote unconsumed (skip_until needs a preceding character), so
        # the closing quote is re-handled on the next pass.
        elsif s.scan(/"/)
          s.skip_until(/[^\\]"/)
        elsif s.scan(/'/)
          s.skip_until(/[^\\]'/)

        # Skip number literals
        elsif s.scan(/(0x)?\d+/)

        # SGML style brackets
        elsif token = s.scan(/<[^\s<>][^<>]*>/)
          extract_sgml_tokens(token).each { |t| tokens << t }

        # Common programming punctuation
        elsif token = s.scan(/;|\{|\}|\(|\)/)
          tokens << token

        # Regular token
        elsif token = s.scan(/[\w\.@#\/\*]+/)
          tokens << token

        # Common operators
        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
          tokens << token

        else
          # Nothing matched; drop one character and keep going.
          s.getch
        end
      end

      tokens
    end

    # Internal: Extract tokens from inside SGML tag.
    #
    # Attribute values are skipped; the tag name (normalized to
    # "<name>") and attribute names are kept.
    #
    # data - SGML tag String.
    #
    # Examples
    #
    #   extract_sgml_tokens("<a href='' class=foo>")
    #   # => ["<a>", "href="]
    #
    # Returns Array of token Strings.
    def extract_sgml_tokens(data)
      s = StringScanner.new(data)

      tokens = []

      until s.eos?
        # Emit start token
        if token = s.scan(/<\/?[^\s>]+/)
          tokens << "#{token}>"

        # Emit attributes with trailing =
        elsif token = s.scan(/\w+=/)
          tokens << token

          # Then skip over attribute value
          if s.scan(/"/)
            s.skip_until(/[^\\]"/)
          elsif s.scan(/'/)
            s.skip_until(/[^\\]'/)
          else
            s.skip_until(/\w+/)
          end

        # Emit lone attributes
        elsif token = s.scan(/\w+/)
          tokens << token

        # Stop at the end of the tag
        elsif s.scan(/>/)
          s.terminate

        else
          # Nothing matched; drop one character and keep going.
          s.getch
        end
      end

      tokens
    end
  end
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: github-linguist
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
|
-
-
|
7
|
+
- 2
|
8
8
|
- 0
|
9
9
|
- 0
|
10
|
-
version:
|
10
|
+
version: 2.0.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- GitHub
|
@@ -81,7 +81,7 @@ dependencies:
|
|
81
81
|
type: :runtime
|
82
82
|
version_requirements: *id004
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
84
|
+
name: json
|
85
85
|
prerelease: false
|
86
86
|
requirement: &id005 !ruby/object:Gem::Requirement
|
87
87
|
none: false
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
version: "0"
|
95
95
|
type: :development
|
96
96
|
version_requirements: *id005
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rake
|
99
|
+
prerelease: false
|
100
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
101
|
+
none: false
|
102
|
+
requirements:
|
103
|
+
- - ">="
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
hash: 3
|
106
|
+
segments:
|
107
|
+
- 0
|
108
|
+
version: "0"
|
109
|
+
type: :development
|
110
|
+
version_requirements: *id006
|
97
111
|
description:
|
98
112
|
email:
|
99
113
|
executables:
|
@@ -104,6 +118,8 @@ extra_rdoc_files: []
|
|
104
118
|
|
105
119
|
files:
|
106
120
|
- lib/linguist/blob_helper.rb
|
121
|
+
- lib/linguist/classifier.rb
|
122
|
+
- lib/linguist/classifier.yml
|
107
123
|
- lib/linguist/file_blob.rb
|
108
124
|
- lib/linguist/language.rb
|
109
125
|
- lib/linguist/languages.yml
|
@@ -112,6 +128,8 @@ files:
|
|
112
128
|
- lib/linguist/pathname.rb
|
113
129
|
- lib/linguist/popular.yml
|
114
130
|
- lib/linguist/repository.rb
|
131
|
+
- lib/linguist/sample.rb
|
132
|
+
- lib/linguist/tokenizer.rb
|
115
133
|
- lib/linguist/vendor.yml
|
116
134
|
- lib/linguist.rb
|
117
135
|
- bin/linguist
|