github-linguist 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/linguist +1 -1
- data/lib/linguist.rb +1 -1
- data/lib/linguist/blob_helper.rb +9 -194
- data/lib/linguist/classifier.rb +50 -111
- data/lib/linguist/language.rb +31 -16
- data/lib/linguist/languages.yml +110 -121
- data/lib/linguist/md5.rb +38 -0
- data/lib/linguist/repository.rb +1 -1
- data/lib/linguist/samples.json +20125 -0
- data/lib/linguist/samples.rb +94 -0
- data/lib/linguist/tokenizer.rb +34 -44
- metadata +21 -5
- data/lib/linguist/classifier.yml +0 -19013
- data/lib/linguist/pathname.rb +0 -92
- data/lib/linguist/sample.rb +0 -74
data/lib/linguist/pathname.rb
DELETED
@@ -1,92 +0,0 @@
|
|
1
|
-
require 'linguist/language'
|
2
|
-
require 'linguist/mime'
|
3
|
-
require 'pygments'
|
4
|
-
|
5
|
-
module Linguist
|
6
|
-
# Similar to ::Pathname, Linguist::Pathname wraps a path string and
|
7
|
-
# provides helpful query methods. It's useful when you only have a
|
8
|
-
# filename but not a blob and need to figure out the language of the file.
|
9
|
-
class Pathname
|
10
|
-
# Public: Initialize a Pathname
|
11
|
-
#
|
12
|
-
# path - A filename String. The file may or may not actually exist.
|
13
|
-
#
|
14
|
-
# Returns a Pathname.
|
15
|
-
def initialize(path)
|
16
|
-
@path = path
|
17
|
-
end
|
18
|
-
|
19
|
-
# Public: Get the basename of the path
|
20
|
-
#
|
21
|
-
# Examples
|
22
|
-
#
|
23
|
-
# Pathname.new('sub/dir/file.rb').basename
|
24
|
-
# # => 'file.rb'
|
25
|
-
#
|
26
|
-
# Returns a String.
|
27
|
-
def basename
|
28
|
-
File.basename(@path)
|
29
|
-
end
|
30
|
-
|
31
|
-
# Public: Get the extname of the path
|
32
|
-
#
|
33
|
-
# Examples
|
34
|
-
#
|
35
|
-
# Pathname.new('.rb').extname
|
36
|
-
# # => '.rb'
|
37
|
-
#
|
38
|
-
# Pathname.new('file.rb').extname
|
39
|
-
# # => '.rb'
|
40
|
-
#
|
41
|
-
# Returns a String.
|
42
|
-
def extname
|
43
|
-
File.extname(@path)
|
44
|
-
end
|
45
|
-
|
46
|
-
# Public: Get the language of the path
|
47
|
-
#
|
48
|
-
# The path extension name is the only heuristic used to detect the
|
49
|
-
# language name.
|
50
|
-
#
|
51
|
-
# Examples
|
52
|
-
#
|
53
|
-
# Pathname.new('file.rb').language
|
54
|
-
# # => Language['Ruby']
|
55
|
-
#
|
56
|
-
# Returns a Language or nil if none was found.
|
57
|
-
def language
|
58
|
-
@language ||= Language.find_by_filename(@path)
|
59
|
-
end
|
60
|
-
|
61
|
-
# Internal: Get the lexer of the path
|
62
|
-
#
|
63
|
-
# Returns a Lexer.
|
64
|
-
def lexer
|
65
|
-
language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
|
66
|
-
end
|
67
|
-
|
68
|
-
# Public: Get the mime type
|
69
|
-
#
|
70
|
-
# Examples
|
71
|
-
#
|
72
|
-
# Pathname.new('index.html').mime_type
|
73
|
-
# # => 'text/html'
|
74
|
-
#
|
75
|
-
# Returns a mime type String.
|
76
|
-
def mime_type
|
77
|
-
@mime_type ||= Mime.mime_for(extname)
|
78
|
-
end
|
79
|
-
|
80
|
-
# Public: Return self as String
|
81
|
-
#
|
82
|
-
# Returns a String
|
83
|
-
def to_s
|
84
|
-
@path.dup
|
85
|
-
end
|
86
|
-
|
87
|
-
def eql?(other)
|
88
|
-
other.is_a?(self.class) && @path == other.to_s
|
89
|
-
end
|
90
|
-
alias_method :==, :eql?
|
91
|
-
end
|
92
|
-
end
|
data/lib/linguist/sample.rb
DELETED
@@ -1,74 +0,0 @@
|
|
1
|
-
require 'linguist/classifier'
|
2
|
-
require 'linguist/language'
|
3
|
-
|
4
|
-
module Linguist
|
5
|
-
# Model for accessing classifier training data.
|
6
|
-
class Sample
|
7
|
-
# Samples live in test/ for now, we'll eventually move them out
|
8
|
-
PATH = File.expand_path("../../../test/fixtures", __FILE__)
|
9
|
-
|
10
|
-
# Public: Iterate over each Sample.
|
11
|
-
#
|
12
|
-
# &block - Yields Sample to block
|
13
|
-
#
|
14
|
-
# Returns nothing.
|
15
|
-
def self.each(&block)
|
16
|
-
Dir.entries(PATH).each do |category|
|
17
|
-
next if category == '.' || category == '..'
|
18
|
-
|
19
|
-
# Skip text and binary for now
|
20
|
-
# Possibly reconsider this later
|
21
|
-
next if category == 'text' || category == 'binary'
|
22
|
-
|
23
|
-
# Map directory name to a Language alias
|
24
|
-
language = Linguist::Language.find_by_alias(category)
|
25
|
-
raise "No language for #{category.inspect}" unless language
|
26
|
-
|
27
|
-
dirname = File.join(PATH, category)
|
28
|
-
Dir.entries(dirname).each do |filename|
|
29
|
-
next if filename == '.' || filename == '..'
|
30
|
-
yield new(File.join(dirname, filename), language)
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
34
|
-
nil
|
35
|
-
end
|
36
|
-
|
37
|
-
# Public: Build Classifier from all samples.
|
38
|
-
#
|
39
|
-
# Returns trained Classifier.
|
40
|
-
def self.classifier
|
41
|
-
classifier = Classifier.new
|
42
|
-
each { |sample| classifier.train(sample.language, sample.data) }
|
43
|
-
classifier.gc
|
44
|
-
end
|
45
|
-
|
46
|
-
# Internal: Initialize Sample.
|
47
|
-
#
|
48
|
-
# Samples should be initialized by Sample.each.
|
49
|
-
#
|
50
|
-
# path - String full path to file.
|
51
|
-
# language - Language of sample.
|
52
|
-
def initialize(path, language)
|
53
|
-
@path = path
|
54
|
-
@language = language
|
55
|
-
end
|
56
|
-
|
57
|
-
# Public: Get full path to file.
|
58
|
-
#
|
59
|
-
# Returns String.
|
60
|
-
attr_reader :path
|
61
|
-
|
62
|
-
# Public: Get sample language.
|
63
|
-
#
|
64
|
-
# Returns Language.
|
65
|
-
attr_reader :language
|
66
|
-
|
67
|
-
# Public: Read file contents.
|
68
|
-
#
|
69
|
-
# Returns String.
|
70
|
-
def data
|
71
|
-
File.read(path)
|
72
|
-
end
|
73
|
-
end
|
74
|
-
end
|