simplificator-babel 0.0.4 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +6 -0
- data/README.markdown +50 -7
- data/Rakefile +46 -0
- data/VERSION.yml +2 -2
- data/babel.gemspec +64 -0
- data/lib/babel/babel.rb +84 -23
- data/lib/babel/profile.rb +43 -25
- data/lib/babel/string_extensions.rb +22 -17
- data/lib/profiles/{profile_deu_1996.yml → profile_deu.yml} +2 -1
- data/lib/profiles/profile_eng.yml +2 -1
- data/lib/profiles/profile_fra.yml +2 -1
- data/lib/profiles/profile_ita.yml +22432 -0
- data/lib/profiles/profile_spa.yml +2 -1
- data/lib/profiles/udhr_txt.zip +0 -0
- data/samples/guessing.rb +28 -0
- data/test/babel_test.rb +3 -34
- data/test/string_extensions_test.rb +10 -8
- metadata +8 -361
- data/lib/data/udhr_txt/index.xml +0 -385
- data/lib/data/udhr_txt/udhr_007.txt +0 -220
- data/lib/data/udhr_txt/udhr_008.txt +0 -220
- data/lib/data/udhr_txt/udhr_009.txt +0 -228
- data/lib/data/udhr_txt/udhr_010.txt +0 -219
- data/lib/data/udhr_txt/udhr_011.txt +0 -232
- data/lib/data/udhr_txt/udhr_abk.txt +0 -218
- data/lib/data/udhr_txt/udhr_ace.txt +0 -221
- data/lib/data/udhr_txt/udhr_acu.txt +0 -222
- data/lib/data/udhr_txt/udhr_ada.txt +0 -220
- data/lib/data/udhr_txt/udhr_afr.txt +0 -219
- data/lib/data/udhr_txt/udhr_agr.txt +0 -219
- data/lib/data/udhr_txt/udhr_aii.txt +0 -216
- data/lib/data/udhr_txt/udhr_ajg.txt +0 -219
- data/lib/data/udhr_txt/udhr_aka_akuapem.txt +0 -221
- data/lib/data/udhr_txt/udhr_aka_asante.txt +0 -220
- data/lib/data/udhr_txt/udhr_aka_fante.txt +0 -219
- data/lib/data/udhr_txt/udhr_als.txt +0 -220
- data/lib/data/udhr_txt/udhr_amc.txt +0 -215
- data/lib/data/udhr_txt/udhr_ame.txt +0 -222
- data/lib/data/udhr_txt/udhr_amh.txt +0 -209
- data/lib/data/udhr_txt/udhr_amr.txt +0 -221
- data/lib/data/udhr_txt/udhr_arb.txt +0 -220
- data/lib/data/udhr_txt/udhr_arl.txt +0 -222
- data/lib/data/udhr_txt/udhr_arn.txt +0 -218
- data/lib/data/udhr_txt/udhr_ast.txt +0 -221
- data/lib/data/udhr_txt/udhr_auv.txt +0 -217
- data/lib/data/udhr_txt/udhr_ayr.txt +0 -218
- data/lib/data/udhr_txt/udhr_azj_cyrl.txt +0 -218
- data/lib/data/udhr_txt/udhr_azj_latn.txt +0 -218
- data/lib/data/udhr_txt/udhr_bam.txt +0 -218
- data/lib/data/udhr_txt/udhr_ban.txt +0 -222
- data/lib/data/udhr_txt/udhr_bba.txt +0 -218
- data/lib/data/udhr_txt/udhr_bci.txt +0 -217
- data/lib/data/udhr_txt/udhr_bcl.txt +0 -219
- data/lib/data/udhr_txt/udhr_bel.txt +0 -221
- data/lib/data/udhr_txt/udhr_bem.txt +0 -217
- data/lib/data/udhr_txt/udhr_ben.txt +0 -222
- data/lib/data/udhr_txt/udhr_bho.txt +0 -219
- data/lib/data/udhr_txt/udhr_bin.txt +0 -232
- data/lib/data/udhr_txt/udhr_bis.txt +0 -218
- data/lib/data/udhr_txt/udhr_blu.txt +0 -219
- data/lib/data/udhr_txt/udhr_boa.txt +0 -223
- data/lib/data/udhr_txt/udhr_bod.txt +0 -221
- data/lib/data/udhr_txt/udhr_bos_cyrl.txt +0 -220
- data/lib/data/udhr_txt/udhr_bos_latn.txt +0 -220
- data/lib/data/udhr_txt/udhr_bre.txt +0 -222
- data/lib/data/udhr_txt/udhr_btb.txt +0 -217
- data/lib/data/udhr_txt/udhr_bug.txt +0 -222
- data/lib/data/udhr_txt/udhr_bul.txt +0 -218
- data/lib/data/udhr_txt/udhr_cab.txt +0 -221
- data/lib/data/udhr_txt/udhr_cak.txt +0 -217
- data/lib/data/udhr_txt/udhr_cat.txt +0 -220
- data/lib/data/udhr_txt/udhr_cbr.txt +0 -219
- data/lib/data/udhr_txt/udhr_cbs.txt +0 -153
- data/lib/data/udhr_txt/udhr_cbt.txt +0 -220
- data/lib/data/udhr_txt/udhr_cbu.txt +0 -218
- data/lib/data/udhr_txt/udhr_ccx.txt +0 -222
- data/lib/data/udhr_txt/udhr_ceb.txt +0 -218
- data/lib/data/udhr_txt/udhr_ces.txt +0 -221
- data/lib/data/udhr_txt/udhr_cha.txt +0 -219
- data/lib/data/udhr_txt/udhr_chj.txt +0 -220
- data/lib/data/udhr_txt/udhr_chk.txt +0 -220
- data/lib/data/udhr_txt/udhr_chr.txt +0 -10
- data/lib/data/udhr_txt/udhr_cic.txt +0 -220
- data/lib/data/udhr_txt/udhr_cjk.txt +0 -218
- data/lib/data/udhr_txt/udhr_cjk_AO.txt +0 -220
- data/lib/data/udhr_txt/udhr_ckb.txt +0 -217
- data/lib/data/udhr_txt/udhr_cmn_hans.txt +0 -220
- data/lib/data/udhr_txt/udhr_cmn_hant.txt +0 -220
- data/lib/data/udhr_txt/udhr_cnh.txt +0 -220
- data/lib/data/udhr_txt/udhr_cni.txt +0 -220
- data/lib/data/udhr_txt/udhr_cos.txt +0 -218
- data/lib/data/udhr_txt/udhr_cot.txt +0 -222
- data/lib/data/udhr_txt/udhr_cpu.txt +0 -219
- data/lib/data/udhr_txt/udhr_crs.txt +0 -217
- data/lib/data/udhr_txt/udhr_csa.txt +0 -223
- data/lib/data/udhr_txt/udhr_csw.txt +0 -163
- data/lib/data/udhr_txt/udhr_ctd.txt +0 -222
- data/lib/data/udhr_txt/udhr_cym.txt +0 -222
- data/lib/data/udhr_txt/udhr_dag.txt +0 -217
- data/lib/data/udhr_txt/udhr_dan.txt +0 -224
- data/lib/data/udhr_txt/udhr_ddn.txt +0 -217
- data/lib/data/udhr_txt/udhr_deu_1901.txt +0 -220
- data/lib/data/udhr_txt/udhr_deu_1996.txt +0 -220
- data/lib/data/udhr_txt/udhr_dga.txt +0 -220
- data/lib/data/udhr_txt/udhr_dip.txt +0 -217
- data/lib/data/udhr_txt/udhr_div.txt +0 -220
- data/lib/data/udhr_txt/udhr_dyo.txt +0 -217
- data/lib/data/udhr_txt/udhr_dzo.txt +0 -9
- data/lib/data/udhr_txt/udhr_ell_monotonic.txt +0 -220
- data/lib/data/udhr_txt/udhr_ell_polytonic.txt +0 -220
- data/lib/data/udhr_txt/udhr_emk.txt +0 -218
- data/lib/data/udhr_txt/udhr_eml.txt +0 -219
- data/lib/data/udhr_txt/udhr_eng.txt +0 -219
- data/lib/data/udhr_txt/udhr_epo.txt +0 -221
- data/lib/data/udhr_txt/udhr_est.txt +0 -219
- data/lib/data/udhr_txt/udhr_eus.txt +0 -220
- data/lib/data/udhr_txt/udhr_eve.txt +0 -207
- data/lib/data/udhr_txt/udhr_ewe.txt +0 -218
- data/lib/data/udhr_txt/udhr_fao.txt +0 -219
- data/lib/data/udhr_txt/udhr_fij.txt +0 -224
- data/lib/data/udhr_txt/udhr_fin.txt +0 -224
- data/lib/data/udhr_txt/udhr_flm.txt +0 -219
- data/lib/data/udhr_txt/udhr_fon.txt +0 -217
- data/lib/data/udhr_txt/udhr_fra.txt +0 -218
- data/lib/data/udhr_txt/udhr_fri.txt +0 -219
- data/lib/data/udhr_txt/udhr_fuc.txt +0 -217
- data/lib/data/udhr_txt/udhr_fur.txt +0 -220
- data/lib/data/udhr_txt/udhr_gaa.txt +0 -220
- data/lib/data/udhr_txt/udhr_gag.txt +0 -223
- data/lib/data/udhr_txt/udhr_gax.txt +0 -222
- data/lib/data/udhr_txt/udhr_gjn.txt +0 -220
- data/lib/data/udhr_txt/udhr_gkp.txt +0 -216
- data/lib/data/udhr_txt/udhr_gla.txt +0 -229
- data/lib/data/udhr_txt/udhr_gle.txt +0 -215
- data/lib/data/udhr_txt/udhr_glg.txt +0 -217
- data/lib/data/udhr_txt/udhr_guc.txt +0 -221
- data/lib/data/udhr_txt/udhr_gug.txt +0 -210
- data/lib/data/udhr_txt/udhr_guj.txt +0 -219
- data/lib/data/udhr_txt/udhr_gyr.txt +0 -203
- data/lib/data/udhr_txt/udhr_hat_kreyol.txt +0 -221
- data/lib/data/udhr_txt/udhr_hat_popular.txt +0 -221
- data/lib/data/udhr_txt/udhr_hau_NE.txt +0 -219
- data/lib/data/udhr_txt/udhr_hau_NG.txt +0 -219
- data/lib/data/udhr_txt/udhr_haw.txt +0 -219
- data/lib/data/udhr_txt/udhr_hea.txt +0 -219
- data/lib/data/udhr_txt/udhr_heb.txt +0 -216
- data/lib/data/udhr_txt/udhr_hil.txt +0 -217
- data/lib/data/udhr_txt/udhr_hin.txt +0 -222
- data/lib/data/udhr_txt/udhr_hms.txt +0 -219
- data/lib/data/udhr_txt/udhr_hna.txt +0 -217
- data/lib/data/udhr_txt/udhr_hni.txt +0 -218
- data/lib/data/udhr_txt/udhr_hrv.txt +0 -218
- data/lib/data/udhr_txt/udhr_hsb.txt +0 -220
- data/lib/data/udhr_txt/udhr_hun.txt +0 -218
- data/lib/data/udhr_txt/udhr_hus.txt +0 -222
- data/lib/data/udhr_txt/udhr_huu.txt +0 -220
- data/lib/data/udhr_txt/udhr_hva.txt +0 -220
- data/lib/data/udhr_txt/udhr_hye.txt +0 -234
- data/lib/data/udhr_txt/udhr_ibb.txt +0 -235
- data/lib/data/udhr_txt/udhr_ibo.txt +0 -219
- data/lib/data/udhr_txt/udhr_ido.txt +0 -224
- data/lib/data/udhr_txt/udhr_iii.txt +0 -9
- data/lib/data/udhr_txt/udhr_ike.txt +0 -163
- data/lib/data/udhr_txt/udhr_ilo.txt +0 -217
- data/lib/data/udhr_txt/udhr_ina.txt +0 -220
- data/lib/data/udhr_txt/udhr_ind.txt +0 -219
- data/lib/data/udhr_txt/udhr_isl.txt +0 -217
- data/lib/data/udhr_txt/udhr_ita.txt +0 -221
- data/lib/data/udhr_txt/udhr_jav.txt +0 -222
- data/lib/data/udhr_txt/udhr_jpn.txt +0 -219
- data/lib/data/udhr_txt/udhr_kal.txt +0 -218
- data/lib/data/udhr_txt/udhr_kan.txt +0 -216
- data/lib/data/udhr_txt/udhr_kat.txt +0 -221
- data/lib/data/udhr_txt/udhr_kaz.txt +0 -218
- data/lib/data/udhr_txt/udhr_kbp.txt +0 -218
- data/lib/data/udhr_txt/udhr_kde.txt +0 -212
- data/lib/data/udhr_txt/udhr_kea.txt +0 -219
- data/lib/data/udhr_txt/udhr_kek.txt +0 -219
- data/lib/data/udhr_txt/udhr_khk.txt +0 -217
- data/lib/data/udhr_txt/udhr_khk_mong.txt +0 -11
- data/lib/data/udhr_txt/udhr_khm.txt +0 -220
- data/lib/data/udhr_txt/udhr_kin.txt +0 -220
- data/lib/data/udhr_txt/udhr_kir.txt +0 -220
- data/lib/data/udhr_txt/udhr_kmb.txt +0 -219
- data/lib/data/udhr_txt/udhr_knc.txt +0 -230
- data/lib/data/udhr_txt/udhr_kng.txt +0 -219
- data/lib/data/udhr_txt/udhr_kng_AO.txt +0 -219
- data/lib/data/udhr_txt/udhr_koo.txt +0 -216
- data/lib/data/udhr_txt/udhr_kor.txt +0 -219
- data/lib/data/udhr_txt/udhr_kqn.txt +0 -218
- data/lib/data/udhr_txt/udhr_kri.txt +0 -226
- data/lib/data/udhr_txt/udhr_ktu.txt +0 -219
- data/lib/data/udhr_txt/udhr_lao.txt +0 -223
- data/lib/data/udhr_txt/udhr_lat.txt +0 -221
- data/lib/data/udhr_txt/udhr_lat_1.txt +0 -220
- data/lib/data/udhr_txt/udhr_lav.txt +0 -220
- data/lib/data/udhr_txt/udhr_lia.txt +0 -218
- data/lib/data/udhr_txt/udhr_lin.txt +0 -217
- data/lib/data/udhr_txt/udhr_lin_tones.txt +0 -214
- data/lib/data/udhr_txt/udhr_lit.txt +0 -218
- data/lib/data/udhr_txt/udhr_lnc.txt +0 -219
- data/lib/data/udhr_txt/udhr_lns.txt +0 -219
- data/lib/data/udhr_txt/udhr_loz.txt +0 -219
- data/lib/data/udhr_txt/udhr_ltz.txt +0 -218
- data/lib/data/udhr_txt/udhr_lua.txt +0 -219
- data/lib/data/udhr_txt/udhr_lue.txt +0 -217
- data/lib/data/udhr_txt/udhr_lug.txt +0 -216
- data/lib/data/udhr_txt/udhr_lun.txt +0 -216
- data/lib/data/udhr_txt/udhr_mad.txt +0 -223
- data/lib/data/udhr_txt/udhr_mag.txt +0 -220
- data/lib/data/udhr_txt/udhr_mah.txt +0 -220
- data/lib/data/udhr_txt/udhr_mai.txt +0 -223
- data/lib/data/udhr_txt/udhr_mal.txt +0 -210
- data/lib/data/udhr_txt/udhr_mam.txt +0 -218
- data/lib/data/udhr_txt/udhr_mar.txt +0 -219
- data/lib/data/udhr_txt/udhr_maz.txt +0 -218
- data/lib/data/udhr_txt/udhr_mcd.txt +0 -220
- data/lib/data/udhr_txt/udhr_mcf.txt +0 -223
- data/lib/data/udhr_txt/udhr_men.txt +0 -222
- data/lib/data/udhr_txt/udhr_mic.txt +0 -218
- data/lib/data/udhr_txt/udhr_min.txt +0 -221
- data/lib/data/udhr_txt/udhr_miq.txt +0 -213
- data/lib/data/udhr_txt/udhr_mkd.txt +0 -221
- data/lib/data/udhr_txt/udhr_mlt.txt +0 -217
- data/lib/data/udhr_txt/udhr_mly_arab.txt +0 -219
- data/lib/data/udhr_txt/udhr_mly_latn.txt +0 -218
- data/lib/data/udhr_txt/udhr_mos.txt +0 -216
- data/lib/data/udhr_txt/udhr_mri.txt +0 -219
- data/lib/data/udhr_txt/udhr_mxi.txt +0 -218
- data/lib/data/udhr_txt/udhr_mxv.txt +0 -223
- data/lib/data/udhr_txt/udhr_mya.txt +0 -219
- data/lib/data/udhr_txt/udhr_mzi.txt +0 -227
- data/lib/data/udhr_txt/udhr_nav.txt +0 -219
- data/lib/data/udhr_txt/udhr_nba.txt +0 -257
- data/lib/data/udhr_txt/udhr_nbl.txt +0 -218
- data/lib/data/udhr_txt/udhr_ndo.txt +0 -217
- data/lib/data/udhr_txt/udhr_nep.txt +0 -214
- data/lib/data/udhr_txt/udhr_nhn.txt +0 -221
- data/lib/data/udhr_txt/udhr_nld.txt +0 -217
- data/lib/data/udhr_txt/udhr_nno.txt +0 -219
- data/lib/data/udhr_txt/udhr_nob.txt +0 -225
- data/lib/data/udhr_txt/udhr_not.txt +0 -218
- data/lib/data/udhr_txt/udhr_nso.txt +0 -219
- data/lib/data/udhr_txt/udhr_nya_chechewa.txt +0 -221
- data/lib/data/udhr_txt/udhr_nya_chinyanja.txt +0 -218
- data/lib/data/udhr_txt/udhr_nym.txt +0 -229
- data/lib/data/udhr_txt/udhr_nyn.txt +0 -213
- data/lib/data/udhr_txt/udhr_nzi.txt +0 -221
- data/lib/data/udhr_txt/udhr_ojb.txt +0 -221
- data/lib/data/udhr_txt/udhr_oss.txt +0 -214
- data/lib/data/udhr_txt/udhr_ote.txt +0 -218
- data/lib/data/udhr_txt/udhr_pam.txt +0 -225
- data/lib/data/udhr_txt/udhr_pan.txt +0 -227
- data/lib/data/udhr_txt/udhr_pau.txt +0 -219
- data/lib/data/udhr_txt/udhr_pbb.txt +0 -218
- data/lib/data/udhr_txt/udhr_pbu.txt +0 -9
- data/lib/data/udhr_txt/udhr_pcd.txt +0 -218
- data/lib/data/udhr_txt/udhr_pcm.txt +0 -218
- data/lib/data/udhr_txt/udhr_pes_1.txt +0 -218
- data/lib/data/udhr_txt/udhr_pes_2.txt +0 -222
- data/lib/data/udhr_txt/udhr_pis.txt +0 -219
- data/lib/data/udhr_txt/udhr_plt.txt +0 -214
- data/lib/data/udhr_txt/udhr_pnb.txt +0 -223
- data/lib/data/udhr_txt/udhr_pol.txt +0 -220
- data/lib/data/udhr_txt/udhr_pon.txt +0 -218
- data/lib/data/udhr_txt/udhr_por_BR.txt +0 -231
- data/lib/data/udhr_txt/udhr_por_PT.txt +0 -219
- data/lib/data/udhr_txt/udhr_pov.txt +0 -220
- data/lib/data/udhr_txt/udhr_ppl.txt +0 -219
- data/lib/data/udhr_txt/udhr_prq.txt +0 -151
- data/lib/data/udhr_txt/udhr_prv.txt +0 -207
- data/lib/data/udhr_txt/udhr_quc.txt +0 -217
- data/lib/data/udhr_txt/udhr_qud.txt +0 -218
- data/lib/data/udhr_txt/udhr_quy.txt +0 -221
- data/lib/data/udhr_txt/udhr_quz.txt +0 -223
- data/lib/data/udhr_txt/udhr_qva.txt +0 -219
- data/lib/data/udhr_txt/udhr_qvc.txt +0 -218
- data/lib/data/udhr_txt/udhr_qvh.txt +0 -217
- data/lib/data/udhr_txt/udhr_qvm.txt +0 -219
- data/lib/data/udhr_txt/udhr_qvn.txt +0 -217
- data/lib/data/udhr_txt/udhr_qwh.txt +0 -218
- data/lib/data/udhr_txt/udhr_qxa.txt +0 -217
- data/lib/data/udhr_txt/udhr_qxn.txt +0 -216
- data/lib/data/udhr_txt/udhr_qxu.txt +0 -221
- data/lib/data/udhr_txt/udhr_rar.txt +0 -220
- data/lib/data/udhr_txt/udhr_rmn.txt +0 -220
- data/lib/data/udhr_txt/udhr_rmn_1.txt +0 -221
- data/lib/data/udhr_txt/udhr_rmy.txt +0 -218
- data/lib/data/udhr_txt/udhr_roh.txt +0 -217
- data/lib/data/udhr_txt/udhr_ron_1953.txt +0 -218
- data/lib/data/udhr_txt/udhr_ron_1993.txt +0 -218
- data/lib/data/udhr_txt/udhr_ron_2006.txt +0 -218
- data/lib/data/udhr_txt/udhr_run.txt +0 -218
- data/lib/data/udhr_txt/udhr_rus.txt +0 -220
- data/lib/data/udhr_txt/udhr_sag.txt +0 -220
- data/lib/data/udhr_txt/udhr_san.txt +0 -219
- data/lib/data/udhr_txt/udhr_sco.txt +0 -222
- data/lib/data/udhr_txt/udhr_shp.txt +0 -224
- data/lib/data/udhr_txt/udhr_skr.txt +0 -225
- data/lib/data/udhr_txt/udhr_slk.txt +0 -219
- data/lib/data/udhr_txt/udhr_slv.txt +0 -218
- data/lib/data/udhr_txt/udhr_sme.txt +0 -220
- data/lib/data/udhr_txt/udhr_smo.txt +0 -226
- data/lib/data/udhr_txt/udhr_sna.txt +0 -223
- data/lib/data/udhr_txt/udhr_snk.txt +0 -220
- data/lib/data/udhr_txt/udhr_som.txt +0 -216
- data/lib/data/udhr_txt/udhr_sot.txt +0 -220
- data/lib/data/udhr_txt/udhr_spa.txt +0 -220
- data/lib/data/udhr_txt/udhr_src.txt +0 -220
- data/lib/data/udhr_txt/udhr_srp_cyrl.txt +0 -218
- data/lib/data/udhr_txt/udhr_srp_latn.txt +0 -218
- data/lib/data/udhr_txt/udhr_srr.txt +0 -219
- data/lib/data/udhr_txt/udhr_ssw.txt +0 -228
- data/lib/data/udhr_txt/udhr_suk.txt +0 -218
- data/lib/data/udhr_txt/udhr_sun.txt +0 -227
- data/lib/data/udhr_txt/udhr_sus.txt +0 -218
- data/lib/data/udhr_txt/udhr_swe.txt +0 -224
- data/lib/data/udhr_txt/udhr_swh.txt +0 -221
- data/lib/data/udhr_txt/udhr_tah.txt +0 -217
- data/lib/data/udhr_txt/udhr_taj.txt +0 -10
- data/lib/data/udhr_txt/udhr_tam.txt +0 -227
- data/lib/data/udhr_txt/udhr_tat.txt +0 -219
- data/lib/data/udhr_txt/udhr_tbz.txt +0 -219
- data/lib/data/udhr_txt/udhr_tca.txt +0 -219
- data/lib/data/udhr_txt/udhr_tem.txt +0 -216
- data/lib/data/udhr_txt/udhr_tet.txt +0 -219
- data/lib/data/udhr_txt/udhr_tgk.txt +0 -217
- data/lib/data/udhr_txt/udhr_tgl.txt +0 -224
- data/lib/data/udhr_txt/udhr_tgl_tglg.txt +0 -9
- data/lib/data/udhr_txt/udhr_tha.txt +0 -217
- data/lib/data/udhr_txt/udhr_tir.txt +0 -217
- data/lib/data/udhr_txt/udhr_tiv.txt +0 -232
- data/lib/data/udhr_txt/udhr_tob.txt +0 -218
- data/lib/data/udhr_txt/udhr_toi.txt +0 -216
- data/lib/data/udhr_txt/udhr_toj.txt +0 -219
- data/lib/data/udhr_txt/udhr_ton.txt +0 -221
- data/lib/data/udhr_txt/udhr_top.txt +0 -220
- data/lib/data/udhr_txt/udhr_tpi.txt +0 -219
- data/lib/data/udhr_txt/udhr_tsn.txt +0 -219
- data/lib/data/udhr_txt/udhr_tso_MZ.txt +0 -220
- data/lib/data/udhr_txt/udhr_tsz.txt +0 -218
- data/lib/data/udhr_txt/udhr_tuk_cyrl.txt +0 -216
- data/lib/data/udhr_txt/udhr_tuk_latn.txt +0 -221
- data/lib/data/udhr_txt/udhr_tur.txt +0 -219
- data/lib/data/udhr_txt/udhr_tzc.txt +0 -219
- data/lib/data/udhr_txt/udhr_tzh.txt +0 -218
- data/lib/data/udhr_txt/udhr_tzm.txt +0 -220
- data/lib/data/udhr_txt/udhr_tzm_tfng.txt +0 -9
- data/lib/data/udhr_txt/udhr_uig_arab.txt +0 -219
- data/lib/data/udhr_txt/udhr_uig_latn.txt +0 -219
- data/lib/data/udhr_txt/udhr_ukr.txt +0 -218
- data/lib/data/udhr_txt/udhr_umb.txt +0 -218
- data/lib/data/udhr_txt/udhr_ura.txt +0 -219
- data/lib/data/udhr_txt/udhr_urd.txt +0 -9
- data/lib/data/udhr_txt/udhr_uzn_cyrl.txt +0 -220
- data/lib/data/udhr_txt/udhr_uzn_latn.txt +0 -220
- data/lib/data/udhr_txt/udhr_vai.txt +0 -224
- data/lib/data/udhr_txt/udhr_vie.txt +0 -221
- data/lib/data/udhr_txt/udhr_vmw.txt +0 -220
- data/lib/data/udhr_txt/udhr_war.txt +0 -219
- data/lib/data/udhr_txt/udhr_wln.txt +0 -220
- data/lib/data/udhr_txt/udhr_wol.txt +0 -219
- data/lib/data/udhr_txt/udhr_wwa.txt +0 -109
- data/lib/data/udhr_txt/udhr_xho.txt +0 -219
- data/lib/data/udhr_txt/udhr_xsm.txt +0 -219
- data/lib/data/udhr_txt/udhr_yad.txt +0 -220
- data/lib/data/udhr_txt/udhr_yao.txt +0 -214
- data/lib/data/udhr_txt/udhr_yap.txt +0 -220
- data/lib/data/udhr_txt/udhr_ydd.txt +0 -223
- data/lib/data/udhr_txt/udhr_ykg.txt +0 -211
- data/lib/data/udhr_txt/udhr_yor.txt +0 -218
- data/lib/data/udhr_txt/udhr_yua.txt +0 -218
- data/lib/data/udhr_txt/udhr_zam.txt +0 -223
- data/lib/data/udhr_txt/udhr_ztu.txt +0 -219
- data/lib/data/udhr_txt/udhr_zul.txt +0 -219
- data/test/train.rb +0 -26
data/.document
ADDED
data/.gitignore
ADDED
data/README.markdown
CHANGED
@@ -1,18 +1,61 @@
|
|
1
1
|
#babel
|
2
2
|
|
3
3
|
Babel is a gem to identify in what language a text is written.
|
4
|
-
It is based on the n-gram approach by
|
4
|
+
It is based on the n-gram approach by Cavnar and Trenkle as described
|
5
|
+
in http://www.sfs.uni-tuebingen.de/iscl/Theses/kranig.pdf
|
5
6
|
|
6
7
|
|
7
8
|
##usage
|
8
9
|
require 'rubygems'
|
9
|
-
require '
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
10
|
+
require 'babel'
|
11
|
+
|
12
|
+
def guess_language(s)
|
13
|
+
puts "'#{s}' is probably '#{s.language}'"
|
14
|
+
end
|
15
|
+
# load the default profiles
|
16
|
+
Babel.load_profiles
|
17
|
+
|
18
|
+
# Let's see what Babel thinks about these texts
|
19
|
+
guess_language 'Montags ist es ruhig'
|
20
|
+
guess_language 'le coq est mort'
|
14
21
|
|
15
|
-
|
22
|
+
# Replace a profile with my own profile
|
23
|
+
Babel.load_profile('eng', '/path/to/my/english/profile.yml')
|
24
|
+
|
25
|
+
# Merge profile data
|
26
|
+
Babel.load_profile('eng', '/path/to/my/other/english/profile.yml', :merge => true)
|
27
|
+
|
28
|
+
# Show Top-3 Languages for a sentence
|
29
|
+
puts "What language could this be written in?".languages[0..2]
|
30
|
+
|
31
|
+
##profiles
|
32
|
+
Profiles are collections of n-grams and the number of occurrences of each n-gram.
|
33
|
+
Babel uses n-grams with length 2-5 (bigram, trigram, tetragram, pentagram).
|
34
|
+
You can create your own profile and decide what n-grams to use and whether
|
35
|
+
you want to limit its size or not.
|
36
|
+
|
37
|
+
These profiles are shipped with the gem:
|
38
|
+
* german (deu) (this profile is built from udhr_deu_1996.txt)
|
39
|
+
* english (eng)
|
40
|
+
* french (fra)
|
41
|
+
* spanish (spa)
|
42
|
+
* italian (ita)
|
43
|
+
|
44
|
+
Want another profile built in? Send an email to info@simplificator.com and if there are enough
|
45
|
+
requests we add the profile.
|
46
|
+
|
47
|
+
The profiles that are shipped with babel are based on the texts found at
|
48
|
+
http://www.unicode.org/udhr/index_by_code.html
|
49
|
+
|
50
|
+
##generating profiles
|
51
|
+
Profiles can be generated with the data found in http://www.unicode.org/udhr/assemblies/udhr_txt.zip or with any other text.
|
52
|
+
Once a profile is generated, Babel can store it in YAML format and load it again from YAML.
|
53
|
+
|
54
|
+
There is a rake task which simplifies profile generation:
|
55
|
+
rake babel:build_profile lang=foo file=myfile.txt dir=destination-directory
|
56
|
+
|
57
|
+
The file generated by this command can be loaded with
|
58
|
+
Babel.load_profile 'foo', 'profile_foo.yml'
|
16
59
|
|
17
60
|
##Copyright
|
18
61
|
|
data/Rakefile
CHANGED
@@ -10,6 +10,9 @@ begin
|
|
10
10
|
gem.homepage = "http://github.com/simplificator/babel"
|
11
11
|
gem.authors = ["simplificator"]
|
12
12
|
gem.add_dependency('ya2yaml', '>= 0.2.6')
|
13
|
+
gem.files.exclude 'lib/data'
|
14
|
+
#gem.files.exclude 'lib/data/*.xml'
|
15
|
+
gem.files.include 'lib/data/*.zip'
|
13
16
|
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
14
17
|
end
|
15
18
|
rescue LoadError
|
@@ -53,4 +56,47 @@ Rake::RDocTask.new do |rdoc|
|
|
53
56
|
rdoc.rdoc_files.include('README*')
|
54
57
|
rdoc.rdoc_files.include('lib/**/*.rb')
|
55
58
|
end
|
59
|
+
require 'rubygems'
|
60
|
+
require 'zip/zip'
|
61
|
+
require 'lib/babel'
|
62
|
+
|
63
|
+
namespace :babel do
|
64
|
+
task :unpack_data do
|
65
|
+
dir = File.join(File.dirname(__FILE__), 'lib', 'data')
|
66
|
+
file = File.join(dir, 'udhr_txt.zip')
|
67
|
+
Zip::ZipFile.open(file) do |zip|
|
68
|
+
zip.each do |entry|
|
69
|
+
destination = File.join(dir, entry.name)
|
70
|
+
FileUtils.mkdir_p(File.dirname(destination))
|
71
|
+
FileUtils.rm(destination) if File.exists?(destination)
|
72
|
+
zip.extract(entry, destination)
|
73
|
+
end
|
74
|
+
end
|
75
|
+
FileUtils.cp(File.join(dir, 'udhr_deu_1996.txt'), File.join(dir, 'udhr_deu.txt'))
|
76
|
+
end
|
77
|
+
|
78
|
+
task :build_profile do
|
79
|
+
if ENV['lang']
|
80
|
+
lang = ENV['lang']
|
81
|
+
file = ENV['file']
|
82
|
+
dir = ENV['dir'] || File.dirname(__FILE__)
|
83
|
+
skip = ENV['skip']
|
84
|
+
limit = ENV['limit']
|
85
|
+
unless file
|
86
|
+
skip ||= 5 # skip header in data files. english all the time
|
87
|
+
file = File.join(File.dirname(__FILE__), 'lib', 'data', "udhr_#{lang}.txt")
|
88
|
+
end
|
89
|
+
puts "Learning about #{lang} from #{file} and save it to #{dir}"
|
90
|
+
File.open(file, 'r') do |f|
|
91
|
+
f.each_with_index do |line, index|
|
92
|
+
if index > skip
|
93
|
+
Babel.learn(lang, line)
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
97
|
+
Babel.save_profile(lang, :dir => dir, :limit => limit)
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
end
|
56
102
|
|
data/VERSION.yml
CHANGED
data/babel.gemspec
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
Gem::Specification.new do |s|
|
4
|
+
s.name = %q{babel}
|
5
|
+
s.version = "0.1.0"
|
6
|
+
|
7
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
|
+
s.authors = ["simplificator"]
|
9
|
+
s.date = %q{2009-07-13}
|
10
|
+
s.email = %q{info@simplificator.com}
|
11
|
+
s.extra_rdoc_files = [
|
12
|
+
"LICENSE",
|
13
|
+
"README.markdown"
|
14
|
+
]
|
15
|
+
s.files = [
|
16
|
+
".document",
|
17
|
+
".gitignore",
|
18
|
+
"LICENSE",
|
19
|
+
"README.markdown",
|
20
|
+
"Rakefile",
|
21
|
+
"VERSION.yml",
|
22
|
+
"babel.gemspec",
|
23
|
+
"lib/babel.rb",
|
24
|
+
"lib/babel/babel.rb",
|
25
|
+
"lib/babel/profile.rb",
|
26
|
+
"lib/babel/string_extensions.rb",
|
27
|
+
"lib/profiles/profile_deu.yml",
|
28
|
+
"lib/profiles/profile_eng.yml",
|
29
|
+
"lib/profiles/profile_fra.yml",
|
30
|
+
"lib/profiles/profile_ita.yml",
|
31
|
+
"lib/profiles/profile_spa.yml",
|
32
|
+
"lib/profiles/udhr_txt.zip",
|
33
|
+
"samples/guessing.rb",
|
34
|
+
"test/babel_test.rb",
|
35
|
+
"test/profile_test.rb",
|
36
|
+
"test/string_extensions_test.rb",
|
37
|
+
"test/test_helper.rb"
|
38
|
+
]
|
39
|
+
s.has_rdoc = true
|
40
|
+
s.homepage = %q{http://github.com/simplificator/babel}
|
41
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
42
|
+
s.require_paths = ["lib"]
|
43
|
+
s.rubygems_version = %q{1.3.2}
|
44
|
+
s.summary = %q{Utility to guess the language of a text}
|
45
|
+
s.test_files = [
|
46
|
+
"test/babel_test.rb",
|
47
|
+
"test/profile_test.rb",
|
48
|
+
"test/string_extensions_test.rb",
|
49
|
+
"test/test_helper.rb"
|
50
|
+
]
|
51
|
+
|
52
|
+
if s.respond_to? :specification_version then
|
53
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
54
|
+
s.specification_version = 3
|
55
|
+
|
56
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
57
|
+
s.add_runtime_dependency(%q<ya2yaml>, [">= 0.2.6"])
|
58
|
+
else
|
59
|
+
s.add_dependency(%q<ya2yaml>, [">= 0.2.6"])
|
60
|
+
end
|
61
|
+
else
|
62
|
+
s.add_dependency(%q<ya2yaml>, [">= 0.2.6"])
|
63
|
+
end
|
64
|
+
end
|
data/lib/babel/babel.rb
CHANGED
@@ -1,54 +1,115 @@
|
|
1
|
+
#
|
2
|
+
#
|
3
|
+
# Profile Generation:
|
4
|
+
# Whenever it's about generating a Profile (Babel.learn, Babel.distances and Babel.guess)
|
5
|
+
# you can pass
|
6
|
+
# * :min_length (2)
|
7
|
+
# * :max_length (5)
|
8
|
+
# * :pad (true)
|
9
|
+
# They are just forwarded to String#n_grams (default values in parentheses)
|
10
|
+
# It's highly recommended that you use the same settings for learning and guessing.
|
11
|
+
|
12
|
+
|
13
|
+
|
1
14
|
module Babel
|
2
15
|
@profiles = {}
|
3
16
|
PROFILE_DIR = File.join(File.dirname(__FILE__), '..', 'profiles')
|
17
|
+
|
18
|
+
# Learn that a text is in a given language.
|
19
|
+
# Calls Profile.learn for the profile with the given language.
|
4
20
|
def self.learn(lang, text, options = {})
|
5
21
|
lang = lang.to_s
|
6
|
-
profile = @profiles[lang] ||= Profile.new()
|
22
|
+
profile = @profiles[lang] ||= Profile.new(lang)
|
7
23
|
profile.learn(text, options)
|
8
24
|
end
|
9
25
|
|
10
|
-
|
26
|
+
# Clear all the profiles
|
11
27
|
def self.clear_profiles
|
12
28
|
@profiles = {}
|
13
29
|
end
|
30
|
+
# find the profile for a language
|
31
|
+
def self.profile(lang)
|
32
|
+
@profiles[lang]
|
33
|
+
end
|
14
34
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
35
|
+
# register a profile
|
36
|
+
# pass :merge => true to merge into an existing profile
|
37
|
+
def self.register_profile(profile, options = {})
|
38
|
+
if options[:merge] && @profiles[profile.language]
|
39
|
+
@profiles[profile.language].merge(profile)
|
40
|
+
else
|
41
|
+
@profiles[profile.language] = profile
|
19
42
|
end
|
20
|
-
found.first if found
|
21
43
|
end
|
22
44
|
|
23
|
-
#
|
24
|
-
|
25
|
-
|
26
|
-
|
45
|
+
# Guess the language of a text.
|
46
|
+
# As soon as there is at least one profile, this method always
|
47
|
+
# returns a value (perhaps the wrong one)...
|
48
|
+
# I.e. if only the "eng" profile is registered, then this method will always return "eng"
|
49
|
+
# no matter what text you pass
|
50
|
+
#
|
51
|
+
def self.guess(source, options = {})
|
52
|
+
distances = Babel.distances(source, options)
|
53
|
+
distances.first.first if distances.first
|
27
54
|
end
|
28
|
-
|
29
|
-
private
|
30
55
|
|
31
|
-
|
32
|
-
|
56
|
+
# Returns an array of [language, distance] pairs.
|
57
|
+
# The language with the shortest distance is the most probable solution.
|
58
|
+
# Sorted by distance, ascending (first item is most probable)
|
59
|
+
def self.distances(text, options = {})
|
60
|
+
source = Profile.new.learn(text, options)
|
61
|
+
@profiles.map { |lang, target| [lang, source.distance(target)] }.sort {|o1, o2| o1.last <=> o2.last}
|
33
62
|
end
|
34
63
|
|
35
|
-
|
64
|
+
|
65
|
+
# Load all the profiles from a given directory.
|
66
|
+
# Loads all .yml files so be careful what directory you specify.
|
67
|
+
# options are:
|
68
|
+
# * :dir the directory, defaults to Babel::PROFILE_DIR
|
69
|
+
# See Babel.load_profile() for other options
|
36
70
|
def self.load_profiles(options = {})
|
37
71
|
dir = options[:directory] || PROFILE_DIR
|
38
72
|
Dir[File.join(PROFILE_DIR, '*.yml')].each do |file|
|
39
|
-
file
|
40
|
-
@profiles[$1] = YAML.load_file(file)
|
73
|
+
Babel.load_profile(file, options)
|
41
74
|
end
|
42
75
|
end
|
43
76
|
|
77
|
+
# Load a single profile
|
78
|
+
# Options are:
|
79
|
+
# * :merge see Babel.register_profile for details
|
80
|
+
def self.load_profile(file, options = {})
|
81
|
+
Babel.register_profile(YAML.load_file(file), options)
|
82
|
+
end
|
83
|
+
|
84
|
+
# Save the profiles to a specified directory.
|
85
|
+
# See Babel.save_profile() for options
|
44
86
|
def self.save_profiles(options = {})
|
45
|
-
dir = options[:directory] || PROFILE_DIR
|
46
87
|
@profiles.each do |lang, profile|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
88
|
+
Babel.save_profile(lang, options)
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
# Save a specific profile
|
93
|
+
# Options are:
|
94
|
+
# * :dir -> the directory to save the files to. Defaults to Babel::PROFILE_DIR
|
95
|
+
# * :limit -> Call limit() on the profile before save. This reduces the size of the profile
|
96
|
+
# for the cost of (possibly) less accurate language guessing
|
97
|
+
def self.save_profile(lang, options = {})
|
98
|
+
dir = options[:dir] || PROFILE_DIR
|
99
|
+
profile = Babel.profile(lang)
|
100
|
+
profile.limit(options[:limit]) if options[:limit]
|
101
|
+
File.open(file_name(dir, lang), 'wb') do |file|
|
102
|
+
file.write(profile.ya2yaml)
|
51
103
|
end
|
104
|
+
end
|
105
|
+
|
106
|
+
private
|
107
|
+
|
108
|
+
# Build the file name for a profile file
|
109
|
+
# Naming scheme: profile_<LANG>.yml
|
110
|
+
def self.file_name(dir, lang)
|
111
|
+
File.join(dir, "profile_#{lang}.yml")
|
52
112
|
end
|
113
|
+
|
53
114
|
end
|
54
115
|
|
data/lib/babel/profile.rb
CHANGED
@@ -1,43 +1,61 @@
|
|
1
1
|
module Babel
|
2
2
|
class Profile
|
3
|
-
|
4
|
-
|
3
|
+
attr_reader :language
|
4
|
+
attr_reader :data
|
5
|
+
def initialize(language = nil)
|
6
|
+
@data = {}
|
5
7
|
@total_occurences = 0
|
8
|
+
@language = language
|
6
9
|
end
|
7
10
|
|
11
|
+
|
12
|
+
# learn a text
|
13
|
+
# following options are used when generating the n-grams:
|
14
|
+
# * min_length => 2
|
15
|
+
# * max_length => 5
|
16
|
+
# * pad => true
|
8
17
|
def learn(text, options = {})
|
9
18
|
options = {:min_length => 2, :max_length => 5, :pad => true}.merge(options)
|
10
19
|
text = clean(text)
|
11
20
|
text.split(' ').each do |word|
|
12
|
-
|
13
|
-
ngrams.each do |ngram|
|
21
|
+
word.n_grams(options).each do |ngram|
|
14
22
|
self.occured(ngram)
|
15
23
|
end
|
16
24
|
end
|
25
|
+
# after learning rank the new n-grams
|
17
26
|
self.rank
|
18
27
|
self # return self so we can chain learn commands. profile.learn('asasas').learn('asdsad')
|
19
28
|
end
|
20
29
|
|
21
30
|
|
31
|
+
def merge(other)
|
32
|
+
if self.language != other.language
|
33
|
+
raise ArgumentError.new("self has a language of #{self.language} but profile to merge has #{other.language}")
|
34
|
+
end
|
35
|
+
other.data.each do |key, value|
|
36
|
+
self.occured(key, value.first)
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
22
40
|
# TODO: needed?
|
23
41
|
def clean(text)
|
24
42
|
return text
|
25
|
-
text = text.gsub(
|
26
|
-
text = text.gsub('.', '')
|
27
|
-
text = text.gsub(';', '')
|
43
|
+
text = text.gsub(/[0-9]/, '')
|
28
44
|
text = text.gsub(':', '')
|
45
|
+
text = text.gsub('/', '')
|
46
|
+
text = text.gsub('_', '')
|
29
47
|
text = text.gsub('(', '')
|
30
48
|
text = text.gsub(')', '')
|
31
|
-
text = text.gsub('
|
32
|
-
text = text.gsub(
|
33
|
-
|
34
|
-
text
|
49
|
+
text = text.gsub(';', '')
|
50
|
+
text = text.gsub('?', '')
|
51
|
+
|
52
|
+
return text
|
35
53
|
end
|
54
|
+
|
36
55
|
# limit this profile to n items
|
37
56
|
# profile needs to be ranked first
|
38
|
-
# do not use this if you plan to extend the profile later on
|
39
57
|
def limit(boundary = 100)
|
40
|
-
@
|
58
|
+
@data.reject! do |key, value|
|
41
59
|
raise 'Please call rank() first' if value.last == 0
|
42
60
|
boundary < value.last
|
43
61
|
end
|
@@ -46,37 +64,37 @@ module Babel
|
|
46
64
|
# rank the current profile
|
47
65
|
# ngrams are sorted by occurence and then ranked
|
48
66
|
# Recompute the ranking of every n-gram in the profile.
#
# Each entry of @data is a two element array [occurrences, ranking]; the
# ranking is set to the n-gram's occurrences divided by the total number
# of occurrences recorded so far (as a Float).
def rank
  total = @total_occurences.to_f
  @data.values.each { |entry| entry[1] = entry[0] / total }
end
|
59
77
|
|
60
|
-
# Called when a
|
78
|
+
# Called when an n-gram has occurred; optionally you can pass an
|
61
79
|
# amount (how many times the ngram occured)
|
62
80
|
# Record that +ngram+ was seen +amount+ times (once by default).
#
# Creates the [occurrences, ranking] entry for an unknown n-gram, bumps
# its occurrence count and the profile-wide total. The ranking slot is
# left untouched; rank has to be called to refresh it.
# Returns the new total number of occurrences.
def occured(ngram, amount = 1)
  entry = (@data[ngram] ||= [0, 0])
  entry[0] += amount
  @total_occurences += amount
end
|
66
84
|
|
67
85
|
# find the occurence of a ngram. if it never occured, returns 0
|
68
86
|
# Number of times +ngram+ has been recorded in this profile.
# Returns 0 for an n-gram the profile has never seen.
def occurence(ngram)
  entry = @data[ngram]
  return 0 unless entry

  entry.first
end
|
71
89
|
|
72
90
|
# find the ranking of a ngram. if it is not yet ranked, return 0
|
73
91
|
# Ranking stored for +ngram+ (the last element of its entry).
# Returns 0 for an n-gram the profile does not contain; an n-gram that
# was counted but not yet ranked also carries 0.
def ranking(ngram)
  entry = @data[ngram]
  return 0 unless entry

  entry.last
end
|
76
94
|
|
77
95
|
# Calculate the distance to another profile
|
78
96
|
def distance(other)
|
79
|
-
@
|
97
|
+
@data.inject(0) do |memo, item|
|
80
98
|
other_ranking = other.ranking(item.first)
|
81
99
|
if other_ranking == 0
|
82
100
|
memo += 1
|
@@ -88,7 +106,7 @@ module Babel
|
|
88
106
|
|
89
107
|
|
90
108
|
# String form of the profile: the inspect output of the n-gram table.
def to_s
  table = @data
  table.inspect
end
|
93
111
|
end
|
94
112
|
end
|
@@ -1,15 +1,19 @@
|
|
1
1
|
class String
|
2
|
-
|
3
|
-
#
|
4
|
-
|
2
|
+
# Generate n-grams for a string.
|
3
|
+
# options are:
|
4
|
+
# :min_length : minimum length of the n-grams (defaults to 1)
|
5
|
+
# :max_length : maximum length of the n-grams (defaults to self.length)
|
6
|
+
# :pad : pad with '_' to generate all possible n-grams (defaults to false)
|
7
|
+
def n_grams(options = {})
|
8
|
+
# TODO: recursive?
|
9
|
+
# TODO: use min/max length for loop index instead of looping
|
10
|
+
# all and then use if test to decide if to add or not
|
5
11
|
min_length = options[:min_length] || 1
|
6
12
|
max_length = options[:max_length] || self.length
|
7
13
|
pad = options[:pad] || false
|
8
14
|
value = options[:preserve_case] ? self : self.downcase
|
9
15
|
value = "_#{value}#{'_' * (value.length - 1)}" if pad
|
10
16
|
res = []
|
11
|
-
# TODO: use min/max length for loop index instead of looping
|
12
|
-
# all and then use if test to decide if to add or not
|
13
17
|
0.upto(value.length - 1) do |index|
|
14
18
|
index.upto(value.length - 1) do |len|
|
15
19
|
if value[index..len].length >= min_length && value[index..len].length <= max_length
|
@@ -20,22 +24,23 @@ class String
|
|
20
24
|
res
|
21
25
|
end
|
22
26
|
|
23
|
-
#
|
24
|
-
#
|
25
|
-
#
|
26
|
-
# value = options[:preserve_case] ? self : self.downcase
|
27
|
-
# res = []
|
28
|
-
#
|
29
|
-
# end
|
30
|
-
|
31
|
-
|
32
|
-
# Ask Babel about the language of this text
|
33
|
-
# Can return nil if no language found
|
27
|
+
# Ask Babel about the language of this text.
|
28
|
+
# Convenience method, just calls Babel.guess().
|
29
|
+
# See Babel.guess for description.
|
34
30
|
# Guess the language of this string.
# Pure delegation: the string and +options+ are handed to Babel.guess and
# whatever it returns is passed back to the caller.
def language(options = {})
  guess = Babel.guess(self, options)
  guess
end
|
33
|
+
# Ask Babel about the languages this text could be.
|
34
|
+
# It will return all the registered languages with the most probable
|
35
|
+
# Language first. You might want to restrict this before presenting to
|
36
|
+
# the user.
|
37
|
+
# List the candidate languages for this string.
# Takes the result of Babel.distances for this string and keeps only the
# first element of every item, preserving the order Babel returned.
def languages(options = {})
  Babel.distances(self, options).map(&:first)
end
|
37
40
|
|
38
|
-
# Tell Babel that this text is in a given language
|
41
|
+
# Tell Babel that this text is in a given language.
|
42
|
+
# Convenience method, just calls Babel.learn().
|
43
|
+
# See Babel.learn for description
|
39
44
|
# Teach Babel that this string is written in +lang+.
# Pure delegation to Babel.learn(lang, self, options).
#
# Note: with assignment syntax (text.language = 'eng') Ruby passes exactly
# one argument, so +options+ can only be supplied through an explicit
# send/public_send call.
def language=(lang, options = {})
  sample = self
  Babel.learn(lang, sample, options)
end
|
@@ -1,5 +1,5 @@
|
|
1
1
|
--- !ruby/object:Babel::Profile
|
2
|
-
|
2
|
+
data:
|
3
3
|
(i:
|
4
4
|
- 1
|
5
5
|
- 1.48557506610809e-05
|
@@ -25359,4 +25359,5 @@
|
|
25359
25359
|
ützu:
|
25360
25360
|
- 1
|
25361
25361
|
- 1.48557506610809e-05
|
25362
|
+
language: deu
|
25362
25363
|
total_occurences: 67314
|
@@ -1,5 +1,5 @@
|
|
1
1
|
--- !ruby/object:Babel::Profile
|
2
|
-
|
2
|
+
data:
|
3
3
|
? ",_"
|
4
4
|
:
|
5
5
|
- 94
|
@@ -20791,4 +20791,5 @@
|
|
20791
20791
|
‐se:
|
20792
20792
|
- 1
|
20793
20793
|
- 1.85742412422453e-05
|
20794
|
+
language: eng
|
20794
20795
|
total_occurences: 53838
|
@@ -1,5 +1,5 @@
|
|
1
1
|
--- !ruby/object:Babel::Profile
|
2
|
-
|
2
|
+
data:
|
3
3
|
? ",_"
|
4
4
|
:
|
5
5
|
- 118
|
@@ -24961,4 +24961,5 @@
|
|
24961
24961
|
’é:
|
24962
24962
|
- 8
|
24963
24963
|
- 0.000123525415354209
|
24964
|
+
language: fra
|
24964
24965
|
total_occurences: 64764
|