pragmatic_tokenizer 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 1b96c776d10f262827ba3a798455e9d942ddf2b9
- data.tar.gz: 81951d2c6047ba10182603ced7b02fabdcbe9e04
+ metadata.gz: 93070f0374fe808125850e652cb0f05cdd262c3e
+ data.tar.gz: 0e69cfc518e9b734ab35e47d7eb0d656aaa22a76
  SHA512:
- metadata.gz: 131b8bd0ce12923f396a028b6c14d5ae20291ec76e31fc53b618eae63078df5a66cc2ff4fa24c25edee987d3ae693d1b7ed6118b67aafa7bd00eb53db01ac2af
- data.tar.gz: 365ec8f6e0a7ac097e962906f7d4b8a5f92e0b50516e6d264bfd9e17c20981bb4093e83bf0df40d5e6578bdc0275c30bde1bc7a27c0503fa8178e9edbd6769a8
+ metadata.gz: e96d74097ee19c69c03952341be97f425200ea056dabd0b36dfae4387df78ca1920e521aaad1c28aef0e3d07b35d68abd21b54d7e21fcfc110c873b1ee319449
+ data.tar.gz: 908a93161d227b7cdca4208157a339c12c075b416ceed0a59757a9227ebde77c1c87e9deae694ba2445cbd1a2dd00f8cb81a91051826de3b8596654272a4e9b4
@@ -40,21 +40,25 @@ module PragmaticTokenizer
 
  def tokenize
    return [] unless text
-   downcase_tokens(
-     cleaner(
-       remove_short_tokens(
-         delete_numbers(
-           delete_roman_numerals(
-             find_contractions(
-               delete_stop_words(
-                 remove_punctuation(
-                   split_at_middle_period_1(
-                     split_at_middle_period_2(
-                       split_beginning_period(
-                         shift_no_spaces_between_sentences(
-                           split_at_forward_slash(
-                             processor.new(language: language_module).process(text: text)
-   ))))))))))))).reject { |t| t.empty? }
+   tokens = []
+   text.scan(/.{,10000}(?=\s|\z)/m).each do |segment|
+     tokens << downcase_tokens(
+       cleaner(
+         remove_short_tokens(
+           delete_numbers(
+             delete_roman_numerals(
+               find_contractions(
+                 delete_stop_words(
+                   remove_punctuation(
+                     split_at_middle_period_1(
+                       split_at_middle_period_2(
+                         split_beginning_period(
+                           shift_no_spaces_between_sentences(
+                             split_at_forward_slash(
+                               processor.new(language: language_module).process(text: segment)
+     ))))))))))))).reject { |t| t.empty? }
+   end
+   tokens.flatten
  end
 
  def domains
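
The functional change in 0.3.3 is confined to #tokenize: instead of feeding the entire input through the nested processing chain in one pass, the text is first cut into segments of at most 10,000 characters, each ending at whitespace or at the end of the string (text.scan(/.{,10000}(?=\s|\z)/m)); the chain then runs once per segment and the per-segment token arrays are flattened into one result. A minimal standalone sketch of just the segmentation step (not gem code; the 12-character limit and sample string are illustrative, chosen so the splitting is visible on a short input):

  text  = "pack my box with five dozen jugs"
  limit = 12                                            # the gem uses 10_000
  segments = text.scan(/.{,#{limit}}(?=\s|\z)/m).reject(&:empty?)
  # Greedy match of up to `limit` characters, required by the lookahead to stop
  # right before whitespace or at end of string, so segments end on word boundaries.
  p segments  # => ["pack my box", " with five", " dozen jugs"]
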
@@ -1,3 +1,3 @@
  module PragmaticTokenizer
- VERSION = "0.3.2"
+ VERSION = "0.3.3"
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: pragmatic_tokenizer
  version: !ruby/object:Gem::Version
- version: 0.3.2
+ version: 0.3.3
  platform: ruby
  authors:
  - Kevin S. Dias