greeb 0.2.2.1 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2e350306a429635314d151614f03ef4a0edd6887
4
- data.tar.gz: 99aa7bb04b04589adbb1f62aacf2733071d14b53
3
+ metadata.gz: 07673b32254cd2b0ab0edf0664fa59e46231dbe3
4
+ data.tar.gz: f8eaac92c0fd4d7dda99c4441117b5e2b34c5caa
5
5
  SHA512:
6
- metadata.gz: ae38f8a918d857283183bde7f003f44b88be59fa67213be58fb9267e2fb51853fba7422918fb1c133c526c401e0baabae7c83ed753ffc38f887a56055884d74e
7
- data.tar.gz: a4d66506d58d10211aa776606f022fda66c038c7c209d3df92e11bf229bf50b990e6c05351287408fe6acce69386f6e18013730563233fbecbc01ebe577471d3
6
+ metadata.gz: ebfda44f713c3dcda9df0439f073a1c075360c14cc41e99c400db8eec25f06033ba2d788b5c4b7b8715eeb5cc085e6c704f6e9bad08bfd6af7b4bc4d051a8c32
7
+ data.tar.gz: cb60f13ddad1e17a7cdbbd19add866677f92590cb7072dd5dc3cf70b80c93a31e0e857366908ff214bb0cc5b2c585415abd78338bd8479ea3150ebf58f5d2117
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2010-2013 Dmitry Ustalov
1
+ Copyright (c) 2010-2014 Dmitry Ustalov
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
data/README.md CHANGED
@@ -1,7 +1,7 @@
1
1
  # Greeb
2
2
  Greeb [grʲip] is a simple yet awesome and Unicode-aware text segmentator
3
- that is based on regular expressions. API documentation is available at
4
- <http://rubydoc.info/github/dmchk/greeb/master/frames>.
3
+ that is based on regular expressions. The API documentation is available
4
+ at <http://rubydoc.info/github/dmchk/greeb/master/frames>.
5
5
 
6
6
  ## Installation
7
7
  Add this line to your application's Gemfile:
@@ -134,12 +134,12 @@ addresses. Greeb can help you in these strings retrieval.
134
134
  ```ruby
135
135
  text = 'My website is http://nlpub.ru and e-mail is example@example.com.'
136
136
 
137
- pp Greeb::Parser.urls(text).map { |e| [e, text[e.from...e.to]] }
137
+ pp Greeb::Parser.urls(text).map { |e| [e, e.slice(text)] }
138
138
  =begin
139
139
  [[#<struct Greeb::Span from=14, to=29, type=:url>, "http://nlpub.ru"]]
140
140
  =end
141
141
 
142
- pp Greeb::Parser.emails(text).map { |e| [e, text[e.from...e.to]] }
142
+ pp Greeb::Parser.emails(text).map { |e| [e, e.slice(text)] }
143
143
  =begin
144
144
  [[#<struct Greeb::Span from=44, to=63, type=:email>, "example@example.com"]]
145
145
  =end
@@ -151,7 +151,7 @@ Please don't use Greeb in spam lists development purposes.
151
151
  ```ruby
152
152
  text = 'Hello, G.L.H.F. everyone!'
153
153
 
154
- pp Greeb::Parser.abbrevs(text).map { |e| [e, text[e.from...e.to]] }
154
+ pp Greeb::Parser.abbrevs(text).map { |e| [e, e.slice(text)] }
155
155
  =begin
156
156
  [[#<struct Greeb::Span from=7, to=15, type=:abbrev>, "G.L.H.F."]]
157
157
  =end
@@ -164,7 +164,7 @@ situations.
164
164
  ```ruby
165
165
  text = 'Our time is running out: 13:37 or 14:89.'
166
166
 
167
- pp Greeb::Parser.time(text).map { |e| [e, text[e.from...e.to]] }
167
+ pp Greeb::Parser.time(text).map { |e| [e, e.slice(text)] }
168
168
  =begin
169
169
  [[#<struct Greeb::Span from=25, to=30, type=:time>, "13:37"]]
170
170
  =end
@@ -194,6 +194,6 @@ There are several span types at the tokenization stage: `:letter`,
194
194
 
195
195
  ## Copyright
196
196
 
197
- Copyright (c) 2010-2013 [Dmitry Ustalov]. See LICENSE for details.
197
+ Copyright (c) 2010-2014 [Dmitry Ustalov]. See LICENSE for details.
198
198
 
199
- [Dmitry Ustalov]: http://eveel.ru
199
+ [Dmitry Ustalov]: http://ustalov.name/
data/bin/greeb CHANGED
@@ -1,13 +1,45 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
+ require 'ostruct'
4
+ require 'optparse'
5
+
3
6
  if File.exists? File.expand_path('../../.git', __FILE__)
4
7
  $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
5
8
  end
6
9
 
7
10
  require 'greeb'
8
11
 
9
- text = STDIN.read.tap(&:chomp!)
12
+ options = OpenStruct.new(input: STDIN, output: STDOUT)
13
+
14
+ optparse = OptionParser.new do |opts|
15
+ opts.banner = 'Usage: %s [options] command' % $PROGRAM_NAME
16
+
17
+ opts.on '-i', '--input [file]', 'Input file' do |input|
18
+ options.input = File.open(input)
19
+ at_exit { options.input.close }
20
+ end
21
+
22
+ opts.on '-o', '--output [file]', 'Output file' do |output|
23
+ options.output = File.open(output)
24
+ at_exit { options.output.close }
25
+ end
26
+
27
+ opts.on_tail '-h', '--help', 'Just display this help' do
28
+ puts opts
29
+ exit
30
+ end
31
+
32
+ opts.on_tail '-v', '--version', 'Just print the version infomation' do
33
+ puts 'Greeb %s' % Greeb::VERSION
34
+ exit
35
+ end
36
+ end
37
+
38
+ optparse.parse!
39
+
40
+ text = options.input.read.tap(&:chomp!)
10
41
 
11
42
  Greeb[text].each do |span|
12
- puts text[span.from...span.to] unless [:space, :break].include? span.type
43
+ next if [:space, :break].include? span.type
44
+ options.output.puts text[span.from...span.to]
13
45
  end
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  require 'greeb/version'
4
2
  require 'greeb/exceptions'
5
3
  require 'greeb/span'
@@ -11,7 +11,8 @@ module Greeb::Core
11
11
 
12
12
  # Recognize e-mail addresses in the input text.
13
13
  #
14
- # @param text [String] input text.
14
+ # @param text [String] an input text.
15
+ # @param helpers [Array<Symbol>] a set of helper identifiers.
15
16
  #
16
17
  # @return [Array<Greeb::Span>] a set of tokens.
17
18
  #
@@ -27,7 +28,6 @@ module Greeb::Core
27
28
 
28
29
  alias_method :'[]', :analyze
29
30
 
30
- protected
31
31
  # Extract spans of the specified type from the input spans set.
32
32
  #
33
33
  # @param spans [Array<Greeb::Span>] input spans set.
@@ -8,20 +8,33 @@ module Greeb::Parser
8
8
  extend self
9
9
 
10
10
  # A URL pattern. Not so precise, but IDN-compatible.
11
+ #
11
12
  URL = %r{\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\p{L}\w\d]+\)|([^.\s]|/)))}i
12
13
 
13
14
  # A horrible e-mail pattern.
15
+ #
14
16
  EMAIL = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/i
15
17
 
16
18
  # Another horrible pattern. Now for abbreviations.
19
+ #
17
20
  ABBREV = /\b((-{0,1}\p{L}\.)*|(-{0,1}\p{L}\. )*)-{0,1}\p{L}\./i
18
21
 
19
22
  # This pattern matches anything that looks like HTML. Or not.
23
+ #
20
24
  HTML = /<(.*?)>/i
21
25
 
22
26
  # Time pattern.
27
+ #
23
28
  TIME = /\b(\d|[0-2]\d):[0-6]\d(:[0-6]\d){0,1}\b/i
24
29
 
30
+ # Apostrophe pattern.
31
+ #
32
+ APOSTROPHE = /['’]/i
33
+
34
+ # Together pattern.
35
+ #
36
+ TOGETHER = [:letter, :integer, :apostrophe, :together]
37
+
25
38
  # Recognize URLs in the input text. Actually, URL is an obsolete standard
26
39
  # and this code should be rewritten to use the URI concept.
27
40
  #
@@ -73,6 +86,54 @@ module Greeb::Parser
73
86
  scan(text, TIME, :time)
74
87
  end
75
88
 
89
+ # Retrieve apostrophes from the tokenized text. The algorithm could be
90
+ # optimized further.
91
+ #
92
+ # @param text [String] input text.
93
+ # @param spans [Array<Greeb::Span>] already tokenized text.
94
+ #
95
+ # @return [Array<Greeb::Span>] retrieved apostrophes.
96
+ #
97
+ def apostrophes(text, spans)
98
+ apostrophes = scan(text, APOSTROPHE, :apostrophe)
99
+ return [] if apostrophes.empty?
100
+
101
+ apostrophes.each { |s| Greeb.extract_spans(spans, s) }.clear
102
+
103
+ spans.each_with_index.each_cons(3).reverse_each do |(s1, i), (s2, j), (s3, k)|
104
+ next unless s1 && s1.type == :letter
105
+ next unless s2 && s2.type == :apostrophe
106
+ next unless !s3 || s3 && s3.type == :letter
107
+ s3, k = s2, j unless s3
108
+ apostrophes << Greeb::Span.new(s1.from, s3.to, s1.type)
109
+ spans[i..k] = apostrophes.last
110
+ end
111
+
112
+ apostrophes
113
+ end
114
+
115
+ # Merge some spans that are together.
116
+ #
117
+ # @param spans [Array<Greeb::Span>] already tokenized text.
118
+ #
119
+ # @return [Array<Greeb::Span>] merged spans.
120
+ #
121
+ def together(spans)
122
+ loop do
123
+ converged = true
124
+
125
+ spans.each_with_index.each_cons(2).reverse_each do |(s1, i), (s2, j)|
126
+ next unless TOGETHER.include?(s1.type) && TOGETHER.include?(s2.type)
127
+ spans[i..j] = Greeb::Span.new(s1.from, s2.to, :together)
128
+ converged = false
129
+ end
130
+
131
+ break if converged
132
+ end
133
+
134
+ spans
135
+ end
136
+
76
137
  private
77
138
  # Implementation of regexp-based {Greeb::Span} scanner.
78
139
  #
@@ -55,12 +55,12 @@ class Greeb::Segmentator
55
55
  # process.
56
56
  # @param stop_marks [Array<Symbol>] an array that stores the
57
57
  # correspondent stop marks of the necessary spans.
58
+ # @param collection [Array<Greeb::Span>] an initial set of spans
59
+ # to be populated.
58
60
  #
59
- # @return [Array<Greeb::Span>] a set of entites.
61
+ # @return [Array<Greeb::Span>] a modified collection.
60
62
  #
61
- def detect_spans(sample, stop_marks)
62
- collection = []
63
-
63
+ def detect_spans(sample, stop_marks, collection = [])
64
64
  rest = tokens.inject(sample.dup) do |span, token|
65
65
  next span if sentence_aint_start? span, token
66
66
  span.from = token.from unless span.from
@@ -77,11 +77,7 @@ class Greeb::Segmentator
77
77
  span
78
78
  end
79
79
 
80
- if rest.from && rest.to
81
- collection << rest
82
- else
83
- collection
84
- end
80
+ rest.from && rest.to ? collection << rest : collection
85
81
  end
86
82
 
87
83
  private
@@ -19,6 +19,16 @@ class Greeb::Span < Struct.new(:from, :to, :type)
19
19
  Struct.new(*self.members, *members)
20
20
  end
21
21
 
22
+ # Select the slice of the given text using coordinates of this span.
23
+ #
24
+ # @param text [String] a text to be extracted.
25
+ #
26
+ # @return [String] the retrieved substring.
27
+ #
28
+ def slice(text)
29
+ text[from...to]
30
+ end
31
+
22
32
  # @private
23
33
  def <=> other
24
34
  if (comparison = self.from <=> other.from) == 0
@@ -47,6 +47,8 @@ module Greeb::Tokenizer
47
47
 
48
48
  # Perform the tokenization process.
49
49
  #
50
+ # @param text [String] a text to be tokenized.
51
+ #
50
52
  # @return [Array<Greeb::Span>] a set of tokens.
51
53
  #
52
54
  def tokenize text
@@ -5,5 +5,5 @@
5
5
  module Greeb
6
6
  # Version of Greeb.
7
7
  #
8
- VERSION = '0.2.2.1'
8
+ VERSION = '0.2.3'
9
9
  end
@@ -21,4 +21,12 @@ describe 'CLI' do
21
21
  invoke(stdin: 'Hello example@example.com guys!').must_equal(
22
22
  %w(Hello example@example.com guys !))
23
23
  end
24
+
25
+ it 'should print version' do
26
+ invoke('-v').join.must_match(/\AGreeb (\d\.)+\d\z/)
27
+ end
28
+
29
+ it 'should print help' do
30
+ invoke('-h').join.must_match(/Usage/)
31
+ end
24
32
  end
@@ -8,9 +8,11 @@ describe Parser do
8
8
  'I am к.ф.-м.н. My website is http://вася.рф/. And my e-mail is ' \
9
9
  'example@example.com! It is available by URL: http://vasya.ru. ' \
10
10
  'Also, <b>G.L.H.F.</b> everyone! It\'s 13:37 or 00:02:28 right ' \
11
- 'now, not 14:89.').freeze
11
+ 'now, not 14:89. What about some Nagibator228?').freeze
12
12
  end
13
13
 
14
+ let(:spans) { Tokenizer.tokenize(text) }
15
+
14
16
  describe 'URL' do
15
17
  subject { Parser.urls(text) }
16
18
 
@@ -67,4 +69,24 @@ describe Parser do
67
69
  )
68
70
  end
69
71
  end
72
+
73
+ describe 'APOSTROPHE' do
74
+ subject { Parser.apostrophes(text, spans.dup) }
75
+
76
+ it 'recognizes apostrophes' do
77
+ subject.must_equal(
78
+ [Span.new(220, 224, :letter)]
79
+ )
80
+ end
81
+ end
82
+
83
+ describe 'TOGETHER' do
84
+ subject { Parser.together(spans.dup) }
85
+
86
+ it 'merges connected spans' do
87
+ subject.select { |s| s.type == :together }.must_equal(
88
+ [Span.new(281, 293, :together)]
89
+ )
90
+ end
91
+ end
70
92
  end
@@ -60,4 +60,14 @@ describe Span do
60
60
  Span.new(1, 2, 3).wont_equal Span.new(1, 2, 4)
61
61
  end
62
62
  end
63
+
64
+ describe 'slicing' do
65
+ let(:text) { 'test228' }
66
+
67
+ subject { Span.new(4, 7) }
68
+
69
+ it 'should extract slices using #slice' do
70
+ subject.slice(text).must_equal '228'
71
+ end
72
+ end
63
73
  end
@@ -19,11 +19,15 @@ class MiniTest::Test
19
19
  arguments = argv.dup
20
20
  options = (arguments.last.is_a? Hash) ? arguments.pop : {}
21
21
  executable = File.expand_path('../../../bin/greeb', __FILE__)
22
+ status = nil
22
23
 
23
- Open3.popen3(executable, *arguments) do |i, o, *_|
24
+ Open3.popen3(executable, *arguments) do |i, o, _, t|
24
25
  i.puts options[:stdin] if options[:stdin]
25
26
  i.close
26
27
  invoke_cache[argv] = o.readlines.map(&:chomp!)
28
+ status = t.value
27
29
  end
30
+
31
+ invoke_cache[argv] if status.success?
28
32
  end
29
33
  end
metadata CHANGED
@@ -1,27 +1,27 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: greeb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2.1
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Ustalov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-11-15 00:00:00.000000000 Z
11
+ date: 2014-05-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: minitest
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '5.0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ~>
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '5.0'
27
27
  description: Greeb is a simple yet awesome and Unicode-aware regexp-based tokenizer,
@@ -33,8 +33,8 @@ executables:
33
33
  extensions: []
34
34
  extra_rdoc_files: []
35
35
  files:
36
- - .gitignore
37
- - .travis.yml
36
+ - ".gitignore"
37
+ - ".travis.yml"
38
38
  - Gemfile
39
39
  - LICENSE
40
40
  - README.md
@@ -68,17 +68,17 @@ require_paths:
68
68
  - lib
69
69
  required_ruby_version: !ruby/object:Gem::Requirement
70
70
  requirements:
71
- - - '>='
71
+ - - ">="
72
72
  - !ruby/object:Gem::Version
73
73
  version: '0'
74
74
  required_rubygems_version: !ruby/object:Gem::Requirement
75
75
  requirements:
76
- - - '>='
76
+ - - ">="
77
77
  - !ruby/object:Gem::Version
78
78
  version: '0'
79
79
  requirements: []
80
80
  rubyforge_project: greeb
81
- rubygems_version: 2.1.9
81
+ rubygems_version: 2.2.2
82
82
  signing_key:
83
83
  specification_version: 4
84
84
  summary: Greeb is a simple Unicode-aware regexp-based tokenizer.