RubyGems - pragmatic_segmenter - Versions diffs - 0.0.1 - Mend

pragmatic_segmenter 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

checksums.yaml +7 -0
data/.gitignore +14 -0
data/.rspec +1 -0
data/Gemfile +4 -0
data/LICENSE.txt +22 -0
data/README.md +730 -0
data/Rakefile +4 -0
data/lib/pragmatic_segmenter.rb +2 -0
data/lib/pragmatic_segmenter/abbreviation.rb +22 -0
data/lib/pragmatic_segmenter/abbreviation_replacer.rb +149 -0
data/lib/pragmatic_segmenter/between_punctuation.rb +78 -0
data/lib/pragmatic_segmenter/cleaner.rb +141 -0
data/lib/pragmatic_segmenter/ellipsis.rb +36 -0
data/lib/pragmatic_segmenter/exclamation_words.rb +19 -0
data/lib/pragmatic_segmenter/languages/amharic.rb +33 -0
data/lib/pragmatic_segmenter/languages/arabic.rb +83 -0
data/lib/pragmatic_segmenter/languages/armenian.rb +33 -0
data/lib/pragmatic_segmenter/languages/burmese.rb +33 -0
data/lib/pragmatic_segmenter/languages/deutsch.rb +132 -0
data/lib/pragmatic_segmenter/languages/english.rb +44 -0
data/lib/pragmatic_segmenter/languages/french.rb +29 -0
data/lib/pragmatic_segmenter/languages/greek.rb +29 -0
data/lib/pragmatic_segmenter/languages/hindi.rb +33 -0
data/lib/pragmatic_segmenter/languages/italian.rb +39 -0
data/lib/pragmatic_segmenter/languages/japanese.rb +58 -0
data/lib/pragmatic_segmenter/languages/persian.rb +56 -0
data/lib/pragmatic_segmenter/languages/russian.rb +60 -0
data/lib/pragmatic_segmenter/languages/spanish.rb +39 -0
data/lib/pragmatic_segmenter/languages/urdu.rb +33 -0
data/lib/pragmatic_segmenter/list.rb +169 -0
data/lib/pragmatic_segmenter/number.rb +35 -0
data/lib/pragmatic_segmenter/process.rb +126 -0
data/lib/pragmatic_segmenter/punctuation.rb +12 -0
data/lib/pragmatic_segmenter/punctuation_replacer.rb +62 -0
data/lib/pragmatic_segmenter/rules.rb +38 -0
data/lib/pragmatic_segmenter/segmenter.rb +81 -0
data/lib/pragmatic_segmenter/sentence_boundary_punctuation.rb +17 -0
data/lib/pragmatic_segmenter/single_letter_abbreviation.rb +37 -0
data/lib/pragmatic_segmenter/types.rb +12 -0
data/lib/pragmatic_segmenter/version.rb +3 -0
data/pragmatic_segmenter.gemspec +25 -0
data/spec/performance_spec.rb +24 -0
data/spec/pragmatic_segmenter_spec.rb +1906 -0
data/spec/spec_helper.rb +1 -0
metadata +150 -0

data/lib/pragmatic_segmenter/languages/italian.rb ADDED Viewed

@@ -0,0 +1,39 @@
+module PragmaticSegmenter
+  module Languages
+    class Italian
+      class Process < PragmaticSegmenter::Process
+        private
+        def replace_abbreviations(txt)
+          PragmaticSegmenter::Languages::Italian::AbbreviationReplacer.new(text: txt).replace
+        end
+      end
+      class Abbreviation < PragmaticSegmenter::Abbreviation
+        ABBREVIATIONS = ['1°', 'a.c', 'a.c/a', 'a.cam', 'a.civ', 'a.cor', 'a.d.r', 'a.gov', 'a.mil', 'a.mon', 'a.smv', 'a.v', 'a/a', 'a/c', 'a/i', 'aa', 'aaaa', 'aaal', 'aacst', 'aamct', 'aams', 'aar', 'aato', 'ab', 'abbigl', 'abbrev', 'abc', 'abi', 'abl', 'abm', 'abr', 'abs', 'absp', 'ac', 'acam', 'acb', 'acbi', 'acc', 'accorc', 'accr', 'acd', 'ace', 'acec', 'acep', 'aci', 'acli', 'acp', 'acro', 'acsit', 'actl', 'ad', 'ad.mil', 'ada', 'adap', 'adatt', 'adc', 'add', 'adei', 'adeion', 'adhd', 'adi', 'adisco', 'adj', 'adm', 'adp', 'adr', 'ads', 'adsi', 'adsl', 'adv', 'ae.b', 'aefi', 'aer', 'aerodin', 'aeron', 'afa', 'afc', 'afci', 'affl', 'afi', 'afic', 'afm', 'afp', 'ag', 'agcm', 'agcom', 'age', 'agecs', 'agesci', 'agg', 'agip', 'agis', 'agm', 'ago', 'agr', 'agric', 'agt', 'ai', 'aia', 'aiab', 'aiac', 'aiace', 'aiap', 'aias', 'aiat', 'aib', 'aic', 'aica', 'aicel', 'aici', 'aics', 'aid', 'aida', 'aidaa', 'aidac', 'aidama', 'aidda', 'aidim', 'aido', 'aids', 'aies', 'aif', 'aih', 'aiip', 'aimi', 'aip', 'aipsc', 'airi', 'ais', 'aisa', 'aism', 'aiss', 'aissca', 'aitc', 'aiti', 'aitr', 'aits', 'aka', 'al', 'alai', 'alch', 'alg', 'ali', 'alim', 'all', 'allev', 'allus', 'alp', 'alq', 'alt', 'am', 'ama', 'amaci', 'amag', 'amami', 'amc', 'ammec', 'amn', 'ampas', 'amps', 'an', 'ana', 'anaai', 'anac', 'anaci', 'anad', 'anai', 'anaoo', 'anart', 'anat', 'anat. 
comp', 'ancci', 'anci', 'ancip', 'ancsa', 'andit', 'anec', 'anee', 'anem', 'anes', 'anffas', 'ani', 'ania', 'anica', 'anie', 'animi', 'anis', 'anisc', 'anm', 'anmfit', 'anmig', 'anmil', 'anmli', 'anms', 'anpa', 'anpas', 'anpci', 'anpe', 'anpi', 'ansi', 'ansv', 'ant', 'anta', 'antifr', 'antlo', 'anton', 'antrop', 'anusca', 'anvi', 'anx', 'ao', 'ap', 'apa', 'apd', 'apea', 'apec', 'apet', 'api', 'apos', 'app', 'app.sc', 'apr', 'aps', 'apt', 'aq', 'ar', 'ar.ind', 'ar.rep', 'arald', 'arame', 'arc', 'arch', 'archeol', 'arci', 'ardsu', 'are', 'arg', 'aritm', 'arpa', 'arpat', 'arred', 'arrt', 'arsia', 'art', 'arti min', 'artig', 'artigl', 'artt', 'as', 'asa', 'asae', 'asc', 'asci', 'ascii', 'ascom', 'ascop', 'asd', 'ase', 'asf', 'asfer', 'asg', 'asic', 'asifa', 'asl', 'asmdc', 'asmi', 'asp', 'aspic', 'aspp', 'assi', 'assic', 'assol', 'asst', 'aster', 'astr', 'astrol', 'astron', 'at', 'ata', 'atb', 'atic', 'atm', 'ats', 'att', 'attrav', 'atv', 'au', 'auc', 'aus', 'auser', 'aut', 'autom', 'av', 'avi', 'avis', 'avo', 'avv', 'avvers', 'awb', 'awdp', 'az', 'azh', 'b.a', 'b2b', 'b2c', 'ba', 'bafta', 'bal', 'ball', 'ban', 'banc', 'bar', 'bart', 'bas', 'bat', 'batt', 'bban', 'bbc', 'bbl', 'bbs', 'bbtc', 'bcc', 'bce', 'bcf', 'bdf', 'bei', 'bep', 'bers', 'bg', 'bi', 'bibl', 'bic', 'bioch', 'biol', 'bl', 'bld', 'bldg', 'blpc', 'bm', 'bmps', 'bmw', 'bn', 'bna', 'bncf', 'bncrm', 'bni', 'bnl', 'bo', 'bot', 'bpl', 'bpm', 'bpn', 'bpr', 'br', 'brd', 'bre', 'bric', 'brig', 'brig.ca', 'brig.gen', 'bros', 'bs', 'bsc', 'bsp', 'bsu', 'bt', 'btc', 'btg', 'btg.l', 'btr', 'bts', 'bu', 'bur', 'bz', 'c.a', 'c.a.p', 'c.c.p', 'c.cost', 'c.d a', 'c.d', 'c.le', 'c.m', 'c.opv', 'c.p', 'c.s', 'c.v', 'c.v.d', 'c/a', 'c/c', 'c/pag', 'ca', 'ca.rep', 'ca.sm', 'ca.sz', 'ca.uf', 'caaf', 'cab', 'cad', 'cae', 'cai', 'cal', 'cam', 'cap', 'capol', 'capt', 'car', 'car.sc', 'carat', 'card', 'cas', 'casaca', 'casd', 'cass.civ', 'cat', 'caus', 'cav', 'cavg', 'cb', 'cbd', 'cbr', 'cbs', 'cc', 'cca', 'ccap', 'ccda', 
'ccdp', 'ccee', 'cciaa', 'ccie', 'ccip', 'cciss', 'ccna', 'ccnl', 'ccnp', 'ccpb', 'ccs', 'ccsp', 'cctld', 'cctv', 'ccv', 'cd', 'cda', 'cdma', 'cdo', 'cdpd', 'cdr', 'cds', 'cdw', 'ce', 'ced', 'cee', 'cei', 'cemat', 'cenelec', 'centr', 'cepis', 'ceps', 'cept', 'cerit', 'cese', 'cesis', 'cesvot', 'cet', 'cf', 'cfa', 'cfr', 'cg', 'cgi', 'cgil', 'cgs', 'ch', 'chf', 'chim', 'chim. ind', 'chir', 'ci', 'ci-europa', 'ciber', 'cicae', 'cid', 'cie', 'cif', 'cifej', 'cig', 'cigs', 'cii', 'cilea', 'cilo', 'cim', 'cime', 'cin', 'cinit', 'cio', 'cipe', 'cirm', 'cisal', 'ciscs', 'cisd', 'cisl', 'cism', 'citol', 'cl', 'class', 'cli', 'cm', 'cmdr', 'cme', 'cmo', 'cmr', 'cms', 'cmyk', 'cm²', 'cm³', 'cn', 'cna', 'cnb', 'cnc', 'cnel', 'cngei', 'cni', 'cnipa', 'cnit', 'cnn', 'cnr', 'cns', 'cnt', 'cnvvf', 'co', 'co.ing', 'co.sa', 'cobas', 'coc', 'cod', 'cod. civ', 'cod. deont. not', 'cod. pen', 'cod. proc. civ', 'cod. proc. pen', 'codec', 'coi', 'col', 'colf', 'coll', 'com', 'comdr', 'comm', 'comp', 'compar', 'compl', 'con', 'conai', 'conc', 'concl', 'condiz', 'confetra', 'confitarma', 'confr', 'cong', 'congeav', 'congiunt', 'coni', 'coniug', 'consec', 'consob', 'contab', 'contr', 'coreco', 'corp', 'corr', 'correl', 'corrisp', 'cosap', 'cospe', 'cost', 'costr', 'cpc', 'cpdel', 'cpe', 'cpi', 'cpl', 'cpt', 'cpu', 'cr', 'cral', 'credem', 'crf', 'cri', 'cric', 'cristall', 'crm', 'cro', 'cron', 'crsm', 'crt', 'cs', 'csa', 'csai', 'csc', 'csm', 'csn', 'css', 'ct', 'ctc', 'cti', 'ctr', 'ctsis', 'cuc', 'cud', 'cun', 'cup', 'cusi', 'cvb', 'cvbs', 'cwt', 'cz', 'd', 'd.c', 'd.i.a', 'dab', 'dac', 'dam', 'dams', 'dat', 'dau', 'db', 'dbms', 'dc', 'dca', 'dccc', 'dda', 'ddp', 'ddr', 'ddt', 'dea', 'decoraz', 'dect', 'dek', 'denom', 'deriv', 'derm', 'determ', 'df', 'dfp', 'dg', 'dga', 'dhcp', 'di', 'dia', 'dial', 'dic', 'dicomac', 'dif', 'difett', 'dig. iv', 'digos', 'dimin', 'dimostr', 'din', 'dipart', 'diplom', 'dir', 'dir. amm', 'dir. can', 'dir. civ', 'dir. d. lav', 'dir. giur', 'dir. 
internaz', 'dir. it', 'dir. pen', 'dir. priv', 'dir. proces', 'dir. pub', 'dir. rom', 'disus', 'diy', 'dl', 'dlf', 'dm', 'dme', 'dmf', 'dmo', 'dmoz', 'dm²', 'dm³', 'dnr', 'dns', 'doa', 'doc', 'docg', 'dom', 'dop', 'dos', 'dott', 'dpa', 'dpi', 'dpl', 'dpof', 'dps', 'dpt', 'dr', 'dra', 'drm', 'drs', 'dry pt', 'ds', 'dslam', 'dspn', 'dss', 'dtc', 'dtmf', 'dtp', 'dts', 'dv', 'dvb', 'dvb-t', 'dvd', 'dvi', 'dwdm', 'e.g', 'e.p.c', 'ead', 'eafrd', 'ean', 'eap', 'easw', 'eb', 'eban', 'ebr', 'ebri', 'ebtn', 'ecc', 'eccl', 'ecdl', 'ecfa', 'ecff', 'ecg', 'ecm', 'econ', 'econ. az', 'econ. dom', 'econ. pol', 'ecpnm', 'ed', 'ed agg', 'edge', 'edi', 'edil', 'edit', 'ef', 'efa', 'efcb', 'efp', 'efsa', 'efta', 'eg', 'egiz', 'egl', 'egr', 'ei', 'eisa', 'elab', 'elettr', 'elettron', 'ellitt', 'emap', 'emas', 'embr', 'emdr', 'emi', 'emr', 'en', 'enaip', 'enal', 'enaoli', 'enapi', 'encat', 'enclic', 'enea', 'enel', 'eni', 'enigm', 'enit', 'enol', 'enpa', 'enpaf', 'enpals', 'enpi', 'enpmf', 'ens', 'entom', 'epd', 'epigr', 'epirbs', 'epl', 'epo', 'ept', 'erc', 'ercom', 'ermes', 'erp', 'es', 'esa', 'escl', 'esist', 'eso', 'esp', 'estens', 'estr. 
min', 'etacs', 'etf', 'eti', 'etim', 'etn', 'etol', 'eu', 'eufem', 'eufic', 'eula', 'eva®', 'f.a', 'f.b', 'f.m', 'f.p', 'fa', 'fabi', 'fac', 'facl', 'facs', 'fad', 'fai', 'faile', 'failp', 'failpa', 'faisa', 'falcri', 'fam', 'famar', 'fans', 'fao', 'fapav', 'faq', 'farm', 'fasi', 'fasib', 'fatt', 'fbe', 'fbi', 'fc', 'fco', 'fcp', 'fcr', 'fcu', 'fdi', 'fe', 'feaog', 'feaosc', 'feb', 'fedic', 'fema', 'feoga', 'ferr', 'fesco', 'fesr', 'fess', 'fg', 'fi', 'fiaf', 'fiaip', 'fiais', 'fialtel', 'fiap', 'fiapf', 'fiat', 'fiavet', 'fic', 'ficc', 'fice', 'fidal', 'fidam', 'fidapa', 'fieg', 'fifa', 'fifo', 'fig', 'figc', 'figs', 'filat', 'filcams', 'file', 'filol', 'filos', 'fim', 'fima', 'fimmg', 'fin', 'finco', 'fio', 'fioto', 'fipe', 'fipresci', 'fis', 'fisar', 'fisc', 'fisg', 'fisiol', 'fisiopatol', 'fistel', 'fit', 'fita', 'fitav', 'fits', 'fiv', 'fivet', 'fivl', 'flo', 'flpd', 'fluid pt', 'fm', 'fmcg', 'fmi', 'fmth', 'fnas', 'fnomceo', 'fnsi', 'fob', 'fod', 'folcl', 'fon', 'fop', 'fotogr', 'fp', 'fpc', 'fpld', 'fr', 'fra', 'fs', 'fsc', 'fse', 'fsf', 'fsfi', 'fsh', 'ft', 'ftase', 'ftbcc', 'fte', 'ftp', 'fts', 'ft²', 'ft³', 'fuaav', 'fut', 'fv', 'fvg', 'g.fv', 'g.u', 'g.u.el', 'gal', 'gats', 'gatt', 'gb', 'gc', 'gccc', 'gco', 'gcost', 'gd', 'gdd', 'gdf', 'gdi', 'gdo', 'gdp', 'ge', 'gea', 'gel', 'gen', 'geneal', 'geod', 'geofis', 'geogr', 'geogr. antr', 'geogr. 
fis', 'geol', 'geom', 'gep', 'germ', 'gescal', 'gg', 'ggv', 'gi', 'gia', 'gides', 'gift', 'gio', 'giorn', 'gis', 'gisma', 'gismo', 'giu', 'gm', 'gmdss', 'gme', 'gmo', 'go', 'gov', 'gp', 'gpl', 'gprs', 'gps', 'gr', 'gr.sel.spec', 'gr.sel.tr', 'gr.sqd', 'gra', 'gram', 'grano', 'grd', 'grtn', 'grv', 'gsa', 'gsm', 'gsm-r', 'gsr', 'gtld', 'gu', 'guce', 'gui', 'gus', 'ha', 'haart', 'haccp', 'hba', 'hcg', 'hcrp', 'hd-dvd', 'hdcp', 'hdi', 'hdml', 'hdtv', 'hepa', 'hfpa', 'hg', 'hifi', 'hiperlan', 'hiv', 'hm', 'hmld', 'hon', 'hosp', 'hpv', 'hr', 'hrh', 'hrm', 'hrt', 'html', 'http', 'hvac', 'hz', 'i.e', 'i.g.m', 'iana', 'iasb', 'iasc', 'iass', 'iat', 'iata', 'iatse', 'iau', 'iban', 'ibid', 'ibm', 'icann', 'icao', 'icbi', 'iccu', 'ice', 'icf', 'ici', 'icm', 'icom', 'icon', 'ics', 'icsi', 'icstis', 'ict', 'icta', 'id', 'iden', 'idl', 'idraul', 'iec', 'iedm', 'ieee', 'ietf', 'ifat', 'ifel', 'ifla', 'ifrs', 'ifto', 'ifts', 'ig', 'igm', 'igmp', 'igp', 'iims', 'iipp', 'ilm', 'ilo', 'ilor', 'ils', 'im', 'imaie', 'imap', 'imc', 'imdb', 'imei', 'imi', 'imms', 'imo', 'imp', 'imper', 'imperf', 'impers', 'imq', 'ims', 'imsi', 'in', 'inail', 'inca', 'incb', 'inci', 'ind', 'ind. agr', 'ind. alim', 'ind. cart', 'ind. chim', 'ind. cuoio', 'ind. estratt', 'ind. graf', 'ind. mecc', 'ind. 
tess', 'indecl', 'indef', 'indeterm', 'indire', 'inea', 'inf', 'infea', 'infm', 'inform', 'ing', 'ingl', 'inmarsat', 'inpdai', 'inpdap', 'inpgi', 'inps', 'inr', 'inran', 'ins', 'insp', 'int', 'inter', 'intr', 'invar', 'invim', 'in²', 'in³', 'ioma', 'iosco', 'ip', 'ipab', 'ipasvi', 'ipi', 'ippc', 'ips', 'iptv', 'iq', 'ira', 'irap', 'ircc', 'ircs', 'irda', 'iref', 'ires', 'iron', 'irpef', 'irpeg', 'irpet', 'irreg', 'is', 'isae', 'isbd', 'isbn', 'isc', 'isdn', 'isee', 'isef', 'isfol', 'isg', 'isi', 'isia', 'ism', 'ismea', 'isnart', 'iso', 'isp', 'ispearmi', 'ispel', 'ispescuole', 'ispesl', 'ispo', 'ispro', 'iss', 'issn', 'istat', 'istol', 'isvap', 'it', 'iti', 'itt', 'ittiol', 'itu', 'iud', 'iugr', 'iulm', 'iva', 'iveco', 'ivg', 'ivr', 'ivs', 'iyhp', 'j', 'jal', 'jit', 'jr', 'jv', 'k', 'kb', 'kee', 'kg', 'kkk', 'klm', 'km', 'km/h', 'kmph', 'kmq', 'km²', 'kr', 'kw', 'kwh', 'l', 'l\'ing', 'l.n', 'l\'avv', 'la', 'lag', 'lan', 'lanc', 'larn', 'laser', 'lat', 'lav', 'lav. femm', 'lav. pubbl', 'laz', 'lb', 'lc', 'lcca', 'lcd', 'le', 'led', 'lett', 'lh', 'li', 'liaf', 'lib', 'lic', 'lic.ord', 'lic.strd', 'licd', 'lice', 'lida', 'lidci', 'liff', 'lifo', 'lig', 'liit', 'lila', 'lilt', 'linfa', 'ling', 'lipu', 'lis', 'lisaac', 'lism', 'lit', 'litab', 'lnp', 'lo', 'loc', 'loc. 
div', 'lolo', 'lom', 'long', 'lp', 'lrm', 'lrms', 'lsi', 'lsu', 'lt', 'ltd', 'lu', 'lug', 'luiss', 'lun', 'lwt', 'lww', 'm.a', 'm.b', 'm.o', 'm/s', 'ma', 'mac', 'macch', 'mag', 'magg.(maj)', 'magg.gen.(maj.gen.)', 'mai', 'maj', 'mar', 'mar.a', 'mar.ca', 'mar.ord', 'marc', 'mat', 'mater', 'max', 'mb', 'mbac', 'mc', 'mcl', 'mcpc', 'mcs', 'md', 'mdf', 'mdp', 'me', 'mec', 'mecc', 'med', 'mediev', 'mef', 'mer', 'merc', 'merid', 'mesa', 'messrs', 'metall', 'meteor', 'metr', 'metrol', 'mg', 'mgc', 'mgm', 'mi', 'mibac', 'mica', 'microb', 'mifed', 'miglio nautico', 'miglio nautico per ora', 'miglio nautico²', 'miglio²', 'mil', 'mile', 'miles/h', 'milesph', 'min', 'miner', 'mips', 'miptv', 'mit', 'mitol', 'miur', 'ml', 'mlle', 'mls', 'mm', 'mme', 'mms', 'mm²', 'mn', 'mnp', 'mo', 'mod', 'mol', 'mons', 'morf', 'mos', 'mpaa', 'mpd', 'mpeg', 'mpi', 'mps', 'mq', 'mr', 'mrs', 'ms', 'msgr', 'mss', 'mt', 'mto', 'murst', 'mus', 'mvds', 'mws', 'm²', 'm³', 'n.a', 'n.b', 'na', 'naa', 'nafta', 'napt', 'nars', 'nasa', 'nat', 'natas', 'nato', 'nb', 'nba', 'nbc', 'ncts', 'nd', 'nda', 'nde', 'ndr', 'ndt', 'ne', 'ned', 'neg', 'neol', 'netpac', 'neur', 'news!', 'ngcc', 'nhmf', 'nlcc', 'nmr', 'no', 'nodo', 'nom', 'nos', 'nov', 'novissdi', 'npi', 'nr', 'nt', 'nta', 'nts', 'ntsc', 'nu', 'nuct', 'numism', 'nwt', 'nyc', 'nz', 'o.m.i', 'oai-pmh', 'oav', 'oc', 'occ', 'occult', 'oci', 'ocr', 'ocse', 'oculist', 'od', 'odg', 'odp', 'oecd', 'oem', 'ofdm', 'oft', 'og', 'ogg', 'ogi', 'ogm', 'ohim', 'oic', 'oics', 'olaf', 'oland', 'ole', 'oled', 'omi', 'oms', 'on', 'ong', 'onig', 'onlus', 'onomat', 'onpi', 'onu', 'op', 'opac', 'opec', 'opord', 'opsosa', 'or', 'ord', 'ord. 
scol', 'ore', 'oref', 'orient', 'ornit', 'orogr', 'orp', 'ort', 'os', 'osa', 'osas', 'osd', 'ot', 'ote', 'ott', 'oz', 'p', 'p.a', 'p.c', 'p.c.c', 'p.es', 'p.f', 'p.m', 'p.r', 'p.s', 'p.t', 'p.v', 'pa', 'pac', 'pag./p', 'pagg./pp', 'pai', 'pal', 'paleobot', 'paleogr', 'paleont', 'paleozool', 'paletn', 'pamr', 'pan', 'papir', 'par', 'parapsicol', 'part', 'partic', 'pass', 'pat', 'patol', 'pb', 'pc', 'pci', 'pcm', 'pcmcia', 'pcs', 'pcss', 'pct', 'pd', 'pda', 'pdf', 'pdl', 'pds', 'pe', 'pec', 'ped', 'pedag', 'peg', 'pegg', 'per.ind', 'pers', 'pert', 'pesq', 'pet', 'petr', 'petrogr', 'pfc', 'pg', 'pga', 'pgp', 'pgut', 'ph', 'php', 'pi', 'pics', 'pie', 'pif', 'pii', 'pil', 'pime', 'pin', 'pine', 'pip', 'pir', 'pit', 'pitt', 'piuss', 'pkcs', 'pki', 'pko', 'pl', 'pli', 'plr', 'pm', 'pma', 'pmi', 'pmr', 'pn', 'pnf', 'pnl', 'po', 'poet', 'pof', 'pol', 'pop', 'popitt', 'popol', 'port', 'pos', 'poss', 'post', 'pots', 'pp', 'ppa', 'ppc', 'ppga', 'ppp', 'pps', 'pptt', 'ppv', 'pr', 'pra', 'praa', 'pref', 'preist', 'prep', 'pres', 'pret', 'prg', 'pri', 'priv', 'pro.civ', 'prof', 'pron', 'pronom', 'propr', 'prov', 'prs', 'prtl', 'prusst', 'ps', 'pse', 'psi', 'psicoan', 'psicol', 'pso', 'psp', 'pstn', 'pt', 'ptc', 'pti', 'ptsd', 'ptt', 'pu', 'pug', 'puk', 'put', 'pv', 'pvb', 'pvc', 'pvt', 'pz', 'qb', 'qcs', 'qfd', 'qg', 'qi', 'qlco', 'qlcu', 'qos', 'qualif', 'r-lan', 'r.s', 'ra', 'racc', 'radar', 'radc', 'radiotecn', 'raee', 'raf', 'rag', 'raid', 'ram', 'rar', 'ras', 'rass. avv. 
stato', 'rc', 'rca', 'rcdp', 'rcs', 'rdc', 'rdco', 'rdf', 'rdi', 'rdp', 'rds', 'rdt', 're', 'rea', 'recipr', 'recl', 'reg', 'region', 'rel', 'rem', 'rep', 'reps', 'res', 'retor', 'rev', 'rfi', 'rfid', 'rg', 'rgb', 'rgc', 'rge', 'rgi', 'rgi bdp', 'rgpt', 'rgt', 'ri', 'riaa', 'riaj', 'riba', 'ric', 'rid', 'rif', 'rifl', 'rina', 'rip', 'ris', 'rit', 'ritts', 'rm', 'rmn', 'rn', 'ro', 'roa', 'roc', 'roi', 'rom', 'roro', 'rov', 'rp', 'rpm', 'rr', 'rrf', 'rs', 'rsc', 'rspp', 'rss', 'rsu', 'rsvp', 'rt', 'rtdpc', 'rtg', 'rtn', 'rtp', 'rttt', 'rvm', 's-dab', 's.a', 's.b.f', 's.n.c', 's.p.a', 's.p.m', 's.r.l', 's.ten', 's.v', 's/m', 'sa', 'sab', 'saca', 'sace', 'sact', 'sad', 'sag', 'sahm', 'sai', 'saisa', 'sam', 'san', 'sanas', 'sape', 'sar', 'sars', 'sart', 'sas', 'sbaf', 'sbas', 'sbn', 'sc', 'sca.sm', 'scherz', 'scien', 'scn', 'scsi', 'scuba', 'scult', 'scut', 'sdds', 'sdiaf', 'sds', 'sdsl', 'se', 'seat', 'sebc', 'sec', 'seca', 'secam', 'secc', 'see', 'seg', 'segg', 'segredifesa', 'sem', 'sempo', 'sen', 'sens', 'seo', 'serg', 'serg.magg.(sgm)', 'serg.magg.ca', 'set', 'sfc', 'sfis', 'sfx', 'sg', 'sga', 'sgc', 'sgg', 'sgml', 'sgt', 'si', 'si@lt', 'sia', 'siae', 'siaic', 'siap', 'sias', 'sic', 'sicav', 'sid', 'sido', 'sie', 'sif', 'sig', 'sig.na', 'sig.ra', 'sige', 'sigg', 'sigill', 'sigo', 'siia', 'simb', 'simbdea', 'simg', 'simo', 'sin', 'sinalv', 'sing', 'sins', 'sinu', 'siocmf', 'siog', 'sioi', 'siommms', 'siot', 'sip', 'sipem', 'sips', 'sirf', 'sirm', 'sis', 'sisde', 'sismi', 'sissa', 'sit', 'siulp', 'siusa', 'sla', 'sldn', 'slm', 'slr', 'sm', 'sma', 'smau', 'smd', 'sme', 'smes', 'smm', 'smpt', 'sms', 'sn', 'snad', 'snai', 'snc', 'sncci', 'sncf', 'sngci', 'snit', 'so', 'soc', 'sociol', 'sogg', 'soho', 'soi', 'sol', 'somipar', 'somm', 'sonar', 'sp', 'spa', 'spe', 'spett', 'spi', 'spm', 'spot', 'spp', 'spreg', 'sq', 'sqd', 'sr', 'srd', 'srl', 'srr', 'ss', 'ssi', 'ssn', 'ssr', 'sss', 'st', 'st. d. arte', 'st. d. dir', 'st. d. filos', 'st. d. 
rel', 'stat', 'stg', 'stp', 'stw', 'su', 'suap', 'suem', 'suff', 'sup', 'superl', 'supt', 'surg', 'surl', 'susm', 'sut', 'suv', 'sv', 'svga', 'swics', 'swift', 'swot', 'sxga', 'sz', 't-dab', 't.sg', 'ta', 'taa', 'tac', 'tacan', 'tacs', 'taeg', 'tai', 'tan', 'tar', 'targa', 'tav', 'tb', 'tbt', 'tci', 'tcp', 'tcp/ip', 'tcsm', 'tdm', 'tdma', 'te', 'tecn', 'tecnol', 'ted', 'tel', 'telecom', 'temp', 'ten.(lt)', 'ten.col.(ltc)', 'ten.gen', 'teol', 'term', 'tesa', 'tese', 'tesol', 'tess', 'tet', 'tetra', 'tfr', 'tft', 'tfts', 'tgv', 'thx', 'tim', 'tipogr', 'tir', 'tit', 'tld', 'tm', 'tmc', 'tn', 'to', 'toefl', 'ton', 'top', 'topog', 'tos', 'tosap', 'tosc', 'tp', 'tpl', 'tr', 'trad', 'tramat', 'trasp', 'ts', 'tso', 'tuir', 'tuld', 'tv', 'twa', 'twain', 'u.ad', 'u.s', 'ucai', 'ucca', 'ucei', 'ucina', 'uclaf', 'ucoi', 'ucoii', 'ucsi', 'ud', 'udc', 'udi', 'udp', 'ue', 'uefa', 'uemri', 'ufo', 'ugc', 'uhci', 'uhf', 'uht', 'uibm', 'uic', 'uicc', 'uiga', 'uil', 'uilps', 'uisp', 'uits', 'uk', 'ul', 'ull', 'uma', 'umb', 'ummc', 'umss', 'umts', 'unac', 'unar', 'unasp', 'uncem', 'unctad', 'undp', 'unefa', 'unep', 'unesco', 'ungh', 'unhcr', 'uni', 'unicef', 'unitec', 'unpredep', 'unsa', 'upa', 'upc', 'urar', 'urban', 'url', 'urp', 'urss', 'usa', 'usb', 'usfi', 'usga', 'usl', 'usp', 'uspi', 'ussr', 'utap', 'v', 'v.brig', 'v.cte', 'v.m', 'v.p', 'v.r', 'v.s', 'va', 'vab', 'vaio', 'val', 'vas', 'vb', 'vbr', 'vc', 'vcc', 'vcr', 'vda', 've', 'ven', 'ves', 'vesa', 'veter', 'vezz', 'vfb', 'vfp', 'vfx', 'vga', 'vhf', 'vhs', 'vi', 'via', 'vip', 'vis', 'vn', 'vo', 'voc', 'voip', 'vol', 'volg', 'voll', 'vor', 'vpdn', 'vpn', 'vr', 'vs', 'vsp', 'vt', 'vtc', 'vts', 'vtt', 'vv', 'vvf', 'wai', 'wais', 'wan', 'wap', 'wasp', 'wc', 'wcdma', 'wcm', 'wga', 'wi-fi', 'wipo', 'wisp', 'wll', 'wml', 'wms', 'worm', 'wp', 'wpan', 'wssn', 'wto', 'wwan', 'wwf', 'www', 'wygiwys', 'xl', 'xml', 'xs', 'xxl', 'xxs', 'yaf', 'yb', 'yci', 'yd', 'yd²', 'yd³', 'ymca', 'zat', 'zb', 'zcs', 'zdf', 'zdg', 'zift', 'zool', 'zoot', 
'ztc', 'ztl', '°c', '°f', '°n', '°ra', '°ré', 'µg']
+        PREPOSITIVE_ABBREVIATIONS = ['a.c', 'acc', 'adj', 'adm', 'adv', 'all', 'amn', 'arch', 'asst', 'avv', 'banc', 'bart', 'bcc', 'bldg', 'brig', 'bros', 'c.a', 'c.a.p', 'c.c.p', 'c.m', 'c.p', 'c.p', 'c.s', 'c.v', 'capt', 'cc', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'corr', 'cpl', 'dir', 'dott', 'dott', 'dr', 'dr', 'drs', 'e.p.c', 'ecc', 'egr', 'ens', 'es', 'fatt', 'gen', 'geom', 'gg', 'gov', 'hon', 'hosp', 'hr', 'id', 'ing', 'insp', 'int', "l'avv", "l'ing", 'lett', 'lt', 'maj', 'messrs', 'mlle', 'mm', 'mme', 'mo', 'mons', 'mr', 'mr', 'mrs', 'mrs', 'ms', 'ms', 'msgr', 'n.b', 'ogg', 'on', 'op', 'ord', 'p.c', 'p.c.c', 'p.es', 'p.f', 'p.r', 'p.s', 'p.t', 'p.v', 'pfc', 'ph', 'post', 'pp', 'prof', 'psicol', 'pvt', 'racc', 'rag', 'rep', 'reps', 'res', 'rev', 'ric', 'rif', 'rp', 'rsvp', 'rt', 's.a', 's.b.f', 's.n.c', 's.p.a', 's.p.m', 's.r.l', 'seg', 'sen', 'sens', 'sfc', 'sgg', 'sgt', 'sig', 'sigg', 'soc', 'spett', 'sr', 'ss', 'st', 'supt', 'surg', 'tel', 'u.s', 'v.p', 'v.r', 'v.s']
+        NUMBER_ABBREVIATIONS = ['art', 'no', 'nos', 'nr', 'pp']
+        def all # Returns the ABBREVIATIONS constant of this class.
+          ABBREVIATIONS
+        end
+        def prepositive # Returns the PREPOSITIVE_ABBREVIATIONS constant of this class.
+          PREPOSITIVE_ABBREVIATIONS
+        end
+        def number # Returns the NUMBER_ABBREVIATIONS constant of this class.
+          NUMBER_ABBREVIATIONS
+        end
+      end
+      class AbbreviationReplacer  < PragmaticSegmenter::AbbreviationReplacer
+        private
+        def abbreviations
+          PragmaticSegmenter::Languages::Italian::Abbreviation.new
+        end
+      end
+    end
+  end
+end

data/lib/pragmatic_segmenter/languages/japanese.rb ADDED Viewed

@@ -0,0 +1,58 @@
+module PragmaticSegmenter
+  module Languages
+    class Japanese
+      class Process < PragmaticSegmenter::Process
+        private
+        def between_punctutation(txt)
+          PragmaticSegmenter::Languages::Japanese::BetweenPunctuation.new(text: txt).replace
+        end
+      end
+      class Cleaner < PragmaticSegmenter::Cleaner
+        # Rubular: http://rubular.com/r/N4kPuJgle7
+        NewLineInMiddleOfWordRule = Rule.new(/(?<=の)\n(?=\S)/, '')
+        def clean
+          super
+          @clean_text = remove_newline_in_middle_of_word(@clean_text)
+        end
+        private
+        def remove_newline_in_middle_of_word(txt)
+          txt.apply(NewLineInMiddleOfWordRule)
+        end
+      end
+      class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
+        # Rubular: http://rubular.com/r/GnjOmry5Z2
+        BETWEEN_QUOTE_JA_REGEX = /\u{300c}(?>[^\u{300c}\u{300d}\\]+|\\{2}|\\.)*\u{300d}/
+        # Rubular: http://rubular.com/r/EjHcZn5ZSG
+        BETWEEN_PARENS_JA_REGEX = /\u{ff08}(?>[^\u{ff08}\u{ff09}\\]+|\\{2}|\\.)*\u{ff09}/
+        private
+        def sub_punctuation_between_quotes_and_parens(txt)
+          super
+          sub_punctuation_between_parens_ja(txt)
+          sub_punctuation_between_quotes_ja(txt)
+        end
+        def sub_punctuation_between_quotes_ja(txt)
+          PragmaticSegmenter::PunctuationReplacer.new(
+            matches_array: txt.scan(BETWEEN_QUOTE_JA_REGEX),
+            text: txt
+          ).replace
+        end
+        def sub_punctuation_between_parens_ja(txt)
+          PragmaticSegmenter::PunctuationReplacer.new(
+            matches_array: txt.scan(BETWEEN_PARENS_JA_REGEX),
+            text: txt
+          ).replace
+        end
+      end
+    end
+  end
+end

data/lib/pragmatic_segmenter/languages/persian.rb ADDED Viewed

@@ -0,0 +1,56 @@
+module PragmaticSegmenter
+  module Languages
+    class Persian
+      class Process < PragmaticSegmenter::Process
+        private
+        def sentence_boundary_punctuation(txt)
+          PragmaticSegmenter::Languages::Persian::SentenceBoundaryPunctuation.new(text: txt).split
+        end
+        def replace_abbreviations(txt)
+          PragmaticSegmenter::Languages::Persian::AbbreviationReplacer.new(text: txt).replace
+        end
+      end
+      class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
+        SENTENCE_BOUNDARY = /.*?[:\.!\?؟]|.*?\z|.*?$/
+        ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭')
+        ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
+        def split
+          txt = replace_non_sentence_boundary_punctuation(text)
+          txt.scan(SENTENCE_BOUNDARY)
+        end
+        private
+        def replace_non_sentence_boundary_punctuation(txt)
+          txt.apply(ReplaceColonBetweenNumbersRule).
+              apply(ReplaceNonSentenceBoundaryCommaRule)
+        end
+      end
+      class Punctuation < PragmaticSegmenter::Punctuation # Punctuation list for Persian text.
+        PUNCT = ['?', '!', ':', '.', '؟'] # ASCII ? ! : . plus the Arabic question mark
+        def punct # Returns PUNCT.
+          PUNCT
+        end
+      end
+      class AbbreviationReplacer  < PragmaticSegmenter::AbbreviationReplacer
+        private
+        def scan_for_replacements(txt, am, index, character_array, abbr)
+          replace_abbr(txt, am)
+        end
+        def replace_abbr(txt, abbr)
+          txt.gsub(/(?<=#{abbr})\./, '∯')
+        end
+      end
+    end
+  end
+end

data/lib/pragmatic_segmenter/languages/russian.rb ADDED Viewed

@@ -0,0 +1,60 @@
+module PragmaticSegmenter
+  module Languages
+    class Russian
+      class Process < PragmaticSegmenter::Process
+        private
+        def replace_abbreviations(txt)
+          PragmaticSegmenter::Languages::Russian::AbbreviationReplacer.new(text: txt).replace
+        end
+      end
+      class Abbreviation < PragmaticSegmenter::Abbreviation
+        ABBREVIATIONS = ['а', 'авт', 'адм.-терр', 'акад', 'в', 'вв', 'вкз', 'вост.-европ', 'г', 'гг', 'гос', 'гр', 'д', 'деп', 'дисс', 'дол', 'долл', 'ежедн', 'ж', 'жен', 'з', 'зап', 'зап.-европ', 'заруб', 'и', 'И', 'и', 'ин', 'иностр', 'инст', 'к', 'кв', 'К', 'Кв', 'куб', 'канд', 'кг', 'л', 'м', 'мин', 'моск', 'муж', 'нед', 'о', 'о', 'О', 'о', 'п', 'пер', 'пп', 'пр', 'просп', 'р', 'руб', 'с', 'сек', 'см', 'СПб', 'стр', 'т', 'т', 'тел', 'тов', 'тт', 'тыс', 'ул', 'у.е', 'y.e', 'у', 'y', 'Ф', 'ф', 'ч', 'пгт', 'проф', 'л.h', 'Л.Н', 'Н']
+        def all # Returns the ABBREVIATIONS constant of this class.
+          ABBREVIATIONS
+        end
+        def prepositive # Always an empty array: no prepositive list is defined here.
+          []
+        end
+        def number # Always an empty array: no number-abbreviation list is defined here.
+          []
+        end
+      end
+      class AbbreviationReplacer  < PragmaticSegmenter::AbbreviationReplacer
+        private
+        def scan_for_replacements(txt, am, index, character_array, abbr)
+          character = character_array[index]
+          prepositive = abbr.prepositive
+          number_abbr = abbr.number
+          upper = /[[:upper:]]/.match(character.to_s)
+          if upper.nil? || prepositive.include?(am.downcase.strip)
+            if prepositive.include?(am.downcase.strip)
+              txt = replace_prepositive_abbr(txt, am)
+            elsif number_abbr.include?(am.downcase.strip)
+              txt = replace_pre_number_abbr(txt, am)
+            else
+              txt = replace_period_of_abbr(txt, am)
+            end
+          end
+          txt
+        end
+        def abbreviations
+          PragmaticSegmenter::Languages::Russian::Abbreviation.new
+        end
+        def replace_period_of_abbr(txt, abbr)
+          txt.gsub(/(?<=\s#{abbr.strip})\./, '∯')
+            .gsub(/(?<=\A#{abbr.strip})\./, '∯')
+            .gsub(/(?<=^#{abbr.strip})\./, '∯')
+        end
+      end
+    end
+  end
+end

data/lib/pragmatic_segmenter/languages/spanish.rb ADDED Viewed

@@ -0,0 +1,39 @@
+module PragmaticSegmenter
+  module Languages
+    class Spanish
+      class Process < PragmaticSegmenter::Process
+        private
+        def replace_abbreviations(txt)
+          PragmaticSegmenter::Languages::Spanish::AbbreviationReplacer.new(text: txt).replace
+        end
+      end
+      class Abbreviation < PragmaticSegmenter::Abbreviation
+        ABBREVIATIONS = ['a.c', 'a/c', 'abr', 'adj', 'admón', 'afmo', 'ago', 'almte', 'ap', 'apdo', 'arq', 'art', 'atte', 'av', 'avda', 'bco', 'bibl', 'bs. as', 'c', 'c.f', 'c.g', 'c/c', 'c/u', 'cap', 'cc.aa', 'cdad', 'cm', 'co', 'cra', 'cta', 'cv', 'd.e.p', 'da', 'dcha', 'dcho', 'dep', 'dic', 'dicc', 'dir', 'dn', 'doc', 'dom', 'dpto', 'dr', 'dra', 'dto', 'ee', 'ej', 'en', 'entlo', 'esq', 'etc', 'excmo', 'ext', 'f.c', 'fca', 'fdo', 'febr', 'ff. aa', 'ff.cc', 'fig', 'fil', 'fra', 'g.p', 'g/p', 'gob', 'gr', 'gral', 'grs', 'hnos', 'hs', 'igl', 'iltre', 'imp', 'impr', 'impto', 'incl', 'ing', 'inst', 'izdo', 'izq', 'izqdo', 'j.c', 'jue', 'jul', 'jun', 'kg', 'km', 'lcdo', 'ldo', 'let', 'lic', 'ltd', 'lun', 'mar', 'may', 'mg', 'min', 'mié', 'mm', 'máx', 'mín', 'mt', 'n. del t', 'n.b', 'no', 'nov', 'ntra. sra', 'núm', 'oct', 'p', 'p.a', 'p.d', 'p.ej', 'p.v.p', 'párrf', 'ppal', 'prev', 'prof', 'prov', 'ptas', 'pts', 'pza', 'pág', 'págs', 'párr', 'q.e.g.e', 'q.e.p.d', 'q.e.s.m', 'reg', 'rep', 'rr. hh', 'rte', 's', 's. a', 's.a.r', 's.e', 's.l', 's.r.c', 's.r.l', 's.s.s', 's/n', 'sdad', 'seg', 'sept', 'sig', 'sr', 'sra', 'sres', 'srta', 'sta', 'sto', 'sáb', 't.v.e', 'tamb', 'tel', 'tfno', 'ud', 'uu', 'uds', 'univ', 'v.b', 'v.e', 'vd', 'vds', 'vid', 'vie', 'vol', 'vs', 'vto', 'a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'multi', 'neo', 'omni', 'para', 'pen', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']
+        PREPOSITIVE_ABBREVIATIONS = ['a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'ee', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'mt', 'multi', 'neo', 'omni', 'para', 'pen', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'srta', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']
+        NUMBER_ABBREVIATIONS = ['cra', 'ext', 'no', 'nos', 'p', 'pp', 'tel']
+        def all # Returns the ABBREVIATIONS constant of this class.
+          ABBREVIATIONS
+        end
+        def prepositive # Returns the PREPOSITIVE_ABBREVIATIONS constant of this class.
+          PREPOSITIVE_ABBREVIATIONS
+        end
+        def number # Returns the NUMBER_ABBREVIATIONS constant of this class.
+          NUMBER_ABBREVIATIONS
+        end
+      end
+      class AbbreviationReplacer  < PragmaticSegmenter::AbbreviationReplacer
+        private
+        def abbreviations
+          PragmaticSegmenter::Languages::Spanish::Abbreviation.new
+        end
+      end
+    end
+  end
+end

data/lib/pragmatic_segmenter/languages/urdu.rb ADDED Viewed

@@ -0,0 +1,33 @@
+module PragmaticSegmenter
+  module Languages
+    class Urdu
+      class Process < PragmaticSegmenter::Process
+        private
+        def sentence_boundary_punctuation(txt)
+          PragmaticSegmenter::Languages::Urdu::SentenceBoundaryPunctuation.new(text: txt).split
+        end
+        def punctuation_array
+          PragmaticSegmenter::Languages::Urdu::Punctuation.new.punct
+        end
+      end
+      class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
+        SENTENCE_BOUNDARY = /.*?[۔؟!\?]|.*?$/
+        def split
+          text.scan(SENTENCE_BOUNDARY)
+        end
+      end
+      class Punctuation < PragmaticSegmenter::Punctuation
+        PUNCT = ['?', '!', '۔', '؟']
+        def punct
+          PUNCT
+        end
+      end
+    end
+  end
+end

data/lib/pragmatic_segmenter/list.rb ADDED Viewed

@@ -0,0 +1,169 @@
+# -*- encoding : utf-8 -*-
+module PragmaticSegmenter
+  # This class searches for a list within a string and adds
+  # newlines before each list item.
+  class List
+    # Rubular: http://rubular.com/r/XcpaJKH0sz
+    ALPHABETICAL_LIST_WITH_PERIODS =
+      /(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)/i
+    # Rubular: http://rubular.com/r/0MIlImeBsC
+    ALPHABETICAL_LIST_WITH_PARENS =
+      /(?<=^)[a-z](?=\))|(?<=\A)[a-z](?=\))|(?<=\s)[a-z](?=\))/i
+    SubstituteListPeriodRule = Rule.new(/♨/, '∯')
+    ListMarkerRule = Rule.new(/☝/, '')
+    # Rubular: http://rubular.com/r/Wv4qLdoPx7
+    SpaceBetweenListItemsFirstRule = Rule.new(/(?<=\S\S|^)\s(?=\S\s*\d+♨)/, "\r")
+    # Rubular: http://rubular.com/r/AizHXC6HxK
+    SpaceBetweenListItemsSecondRule = Rule.new(/(?<=\S\S|^)\s(?=\d+♨)/, "\r")
+    # Rubular: http://rubular.com/r/GE5q6yID2j
+    SpaceBetweenListItemsThirdRule = Rule.new(/(?<=\S\S|^)\s(?=\d+☝)/, "\r")
+    NUMBERED_LIST_REGEX_1 =
+      /\s\d+(?=\.\s)|^\d+(?=\.\s)|\s\d+(?=\.\))|^\d+(?=\.\))|(?<=\s\-)\d+(?=\.\s)|(?<=^\-)\d+(?=\.\s)|(?<=\s\⁃)\d+(?=\.\s)|(?<=^\⁃)\d+(?=\.\s)|(?<=s\-)\d+(?=\.\))|(?<=^\-)\d+(?=\.\))|(?<=\s\⁃)\d+(?=\.\))|(?<=^\⁃)\d+(?=\.\))/
+    NUMBERED_LIST_REGEX_2 =
+      /(?<=\s)\d+\.(?=\s)|^\d+\.(?=\s)|(?<=\s)\d+\.(?=\))|^\d+\.(?=\))|(?<=\s\-)\d+\.(?=\s)|(?<=^\-)\d+\.(?=\s)|(?<=\s\⁃)\d+\.(?=\s)|(?<=^\⁃)\d+\.(?=\s)|(?<=\s\-)\d+\.(?=\))|(?<=^\-)\d+\.(?=\))|(?<=\s\⁃)\d+\.(?=\))|(?<=^\⁃)\d+\.(?=\))/
+    NUMBERED_LIST_PARENS_REGEX = /\d+(?=\)\s)/
+    # Rubular: http://rubular.com/r/0MIlImeBsC
+    EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX =
+      /(?<=^)[a-z](?=\))|(?<=\A)[a-z](?=\))|(?<=\s)[a-z](?=\))/i
+    # Rubular: http://rubular.com/r/wMpnVedEIb
+    ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX =
+      /(?<=^)[a-z]\.|(?<=\A)[a-z]\.|(?<=\s)[a-z]\./i
+    attr_reader :text
+    def initialize(text:)
+      @text = Text.new(text)
+    end
+    def add_line_break
+      formatted_text = format_alphabetical_lists(text)
+      formatted_text = format_numbered_list_with_periods(formatted_text)
+      format_numbered_list_with_parens(formatted_text)
+    end
+    private
+    def format_numbered_list_with_parens(txt)
+      new_txt = replace_parens_in_numbered_list(txt)
+      new_txt = add_line_breaks_for_numbered_list_with_parens(new_txt)
+      new_txt.apply(ListMarkerRule)
+    end
+    def format_numbered_list_with_periods(txt)
+      new_txt = replace_periods_in_numbered_list(txt)
+      new_txt = add_line_breaks_for_numbered_list_with_periods(new_txt)
+      new_txt.apply(SubstituteListPeriodRule)
+    end
+    def format_alphabetical_lists(txt)
+      new_txt = add_line_breaks_for_alphabetical_list_with_periods(txt)
+      add_line_breaks_for_alphabetical_list_with_parens(new_txt)
+    end
+    def replace_periods_in_numbered_list(txt)
+      scan_lists(NUMBERED_LIST_REGEX_1, NUMBERED_LIST_REGEX_2, '♨', true, txt)
+    end
+    def add_line_breaks_for_numbered_list_with_periods(txt)
+      return txt unless txt.include?('♨') &&
+                        txt !~ /♨.+\n.+♨|♨.+\r.+♨/ &&
+                        txt !~ /for\s\d+♨\s[a-z]/
+      txt.apply(SpaceBetweenListItemsFirstRule).
+          apply(SpaceBetweenListItemsSecondRule)
+    end
+    def replace_parens_in_numbered_list(txt)
+      scan_lists(
+        NUMBERED_LIST_PARENS_REGEX, NUMBERED_LIST_PARENS_REGEX, '☝', false, txt)
+    end
+    def add_line_breaks_for_numbered_list_with_parens(txt)
+      return txt unless txt.include?('☝') && txt !~ /☝.+\n.+☝|☝.+\r.+☝/
+      txt.apply(SpaceBetweenListItemsThirdRule)
+    end
+    def scan_lists(regex1, regex2, replacement, strip, txt)
+      list_array = txt.scan(regex1).map(&:to_i)
+      list_array.each_with_index do |a, i|
+        next unless (a + 1).eql?(list_array[i + 1]) ||
+                    (a - 1).eql?(list_array[i - 1]) ||
+                    (a.eql?(0) && list_array[i - 1].eql?(9)) ||
+                    (a.eql?(9) && list_array[i + 1].eql?(0))
+        substitute_found_list_items(txt, regex2, a, strip, replacement)
+      end
+      txt
+    end
+    def substitute_found_list_items(txt, regex, a, strip, replacement)
+      txt.gsub!(regex).with_index do |m|
+        if a.to_s.eql?(strip ? m.strip.chop : m)
+          "#{Regexp.escape(a.to_s)}" + replacement
+        else
+          "#{m}"
+        end
+      end
+    end
+    def add_line_breaks_for_alphabetical_list_with_periods(txt)
+      iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PERIODS, false, txt)
+    end
+    def add_line_breaks_for_alphabetical_list_with_parens(txt)
+      iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PARENS, true, txt)
+    end
+    def replace_alphabet_list(a, txt)
+      txt.gsub!(ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX).with_index do |m|
+        a.eql?(m.chomp('.')) ? "\r#{Regexp.escape(a.to_s)}∯" : "#{m}"
+      end
+      txt
+    end
+    def replace_alphabet_list_parens(a, txt)
+      txt.gsub!(EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX).with_index do |m|
+        a.eql?(m) ? "\r#{Regexp.escape(a.to_s)}" : "#{m}"
+      end
+      txt
+    end
+    def replace_correct_alphabet_list(a, txt, parens)
+      if parens
+        replace_alphabet_list_parens(a, txt)
+      else
+        replace_alphabet_list(a, txt)
+      end
+    end
+    def last_array_item_replacement(a, i, alphabet, list_array, txt, parens)
+      return if (alphabet.index(list_array[i - 1]) - alphabet.index(a)).abs != 1
+      replace_correct_alphabet_list(a, txt, parens)
+    end
+    def other_items_replacement(a, i, alphabet, list_array, txt, parens)
+      return if alphabet.index(list_array[i + 1]) - alphabet.index(a) != 1 &&
+                (alphabet.index(list_array[i - 1]) - alphabet.index(a)).abs != 1
+      replace_correct_alphabet_list(a, txt, parens)
+    end
+    def iterate_alphabet_array(regex, parens, txt)
+      list_array = txt.scan(regex).map(&:downcase)
+      alphabet = ('a'..'z').to_a
+      list_array.each_with_index do |a, i|
+        if i.eql?(list_array.length - 1)
+          last_array_item_replacement(a, i, alphabet, list_array, txt, parens)
+        else
+          other_items_replacement(a, i, alphabet, list_array, txt, parens)
+        end
+      end
+      txt
+    end
+  end
+end