pragmatic_tokenizer 0.3.2 → 0.3.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 1b96c776d10f262827ba3a798455e9d942ddf2b9
-  data.tar.gz: 81951d2c6047ba10182603ced7b02fabdcbe9e04
+  metadata.gz: 93070f0374fe808125850e652cb0f05cdd262c3e
+  data.tar.gz: 0e69cfc518e9b734ab35e47d7eb0d656aaa22a76
 SHA512:
-  metadata.gz: 131b8bd0ce12923f396a028b6c14d5ae20291ec76e31fc53b618eae63078df5a66cc2ff4fa24c25edee987d3ae693d1b7ed6118b67aafa7bd00eb53db01ac2af
-  data.tar.gz: 365ec8f6e0a7ac097e962906f7d4b8a5f92e0b50516e6d264bfd9e17c20981bb4093e83bf0df40d5e6578bdc0275c30bde1bc7a27c0503fa8178e9edbd6769a8
+  metadata.gz: e96d74097ee19c69c03952341be97f425200ea056dabd0b36dfae4387df78ca1920e521aaad1c28aef0e3d07b35d68abd21b54d7e21fcfc110c873b1ee319449
+  data.tar.gz: 908a93161d227b7cdca4208157a339c12c075b416ceed0a59757a9227ebde77c1c87e9deae694ba2445cbd1a2dd00f8cb81a91051826de3b8596654272a4e9b4
@@ -40,21 +40,25 @@ module PragmaticTokenizer
 
   def tokenize
     return [] unless text
-    downcase_tokens(
-      cleaner(
-        remove_short_tokens(
-          delete_numbers(
-            delete_roman_numerals(
-              find_contractions(
-                delete_stop_words(
-                  remove_punctuation(
-                    split_at_middle_period_1(
-                      split_at_middle_period_2(
-                        split_beginning_period(
-                          shift_no_spaces_between_sentences(
-                            split_at_forward_slash(
-                              processor.new(language: language_module).process(text: text)
-    ))))))))))))).reject { |t| t.empty? }
+    tokens = []
+    text.scan(/.{,10000}(?=\s|\z)/m).each do |segment|
+      tokens << downcase_tokens(
+        cleaner(
+          remove_short_tokens(
+            delete_numbers(
+              delete_roman_numerals(
+                find_contractions(
+                  delete_stop_words(
+                    remove_punctuation(
+                      split_at_middle_period_1(
+                        split_at_middle_period_2(
+                          split_beginning_period(
+                            shift_no_spaces_between_sentences(
+                              split_at_forward_slash(
+                                processor.new(language: language_module).process(text: segment)
+      ))))))))))))).reject { |t| t.empty? }
+    end
+    tokens.flatten
   end
 
   def domains
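The tokenize change above stops feeding the whole input through the pipeline in one pass: the text is first split into segments of at most 10,000 characters whose boundaries fall on whitespace (enforced by the (?=\s|\z) lookahead), each segment is tokenized separately, and tokens.flatten merges the per-segment arrays back into a single token list. Below is a minimal sketch of that segmenting regex in plain Ruby; the sample text and variable names are illustrative only and are not part of the gem.

# Split a long string into chunks of at most 10,000 characters, each ending
# right before whitespace or at the end of the string, so words are not cut
# in half (assuming no single whitespace-free run exceeds 10,000 characters).
# Same pattern as used in the tokenize method above.
text     = ("lorem ipsum dolor sit amet " * 1_000).strip
segments = text.scan(/.{,10000}(?=\s|\z)/m)

p segments.size                # number of chunks produced
p segments.map(&:length).max   # never exceeds 10_000
# Note: String#scan may also emit a trailing empty match at the end of the string.

Processing the input in bounded chunks keeps each pass through the regex-heavy pipeline working on a short string rather than the entire document at once.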
@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "0.3.2"
+  VERSION = "0.3.3"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 0.3.2
+  version: 0.3.3
 platform: ruby
 authors:
 - Kevin S. Dias