ul-wukong 4.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.gitignore +60 -0
- data/.gitmodules +6 -0
- data/.rspec +2 -0
- data/.travis.yml +19 -0
- data/.yardopts +6 -0
- data/CHANGELOG.md +7 -0
- data/Gemfile +17 -0
- data/Guardfile +12 -0
- data/LICENSE.md +95 -0
- data/NOTES-travis.md +31 -0
- data/README-old.md +422 -0
- data/README.md +1308 -0
- data/Rakefile +28 -0
- data/TODO.md +99 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +6 -0
- data/bin/md5sort +20 -0
- data/bin/setcat +11 -0
- data/bin/tabchar +5 -0
- data/bin/uniq-ord +59 -0
- data/bin/uniqc +3 -0
- data/bin/wu +34 -0
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-date +13 -0
- data/bin/wu-datetime +13 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +186 -0
- data/bin/wu-local +4 -0
- data/bin/wu-plus +9 -0
- data/bin/wu-source +5 -0
- data/bin/wu-sum +31 -0
- data/diagrams/wu_local.dot +39 -0
- data/diagrams/wu_local.dot.png +0 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/basic/string_reverser.rb +23 -0
- data/examples/basic/tiny_count.rb +8 -0
- data/examples/basic/word_count/accumulator.rb +26 -0
- data/examples/basic/word_count/tokenizer.rb +13 -0
- data/examples/basic/word_count/word_count.rb +6 -0
- data/examples/dataflow/scraper_macro_flow.rb +28 -0
- data/examples/deploy_pack/Gemfile +6 -0
- data/examples/deploy_pack/README.md +6 -0
- data/examples/deploy_pack/a/b/c/.gitkeep +0 -0
- data/examples/deploy_pack/app/processors/string_reverser.rb +5 -0
- data/examples/deploy_pack/config/environment.rb +1 -0
- data/examples/dsl/dataflow/fibonacci_series.rb +101 -0
- data/examples/dsl/dataflow/scraper_macro_flow.rb +28 -0
- data/examples/dsl/dataflow/simple.rb +12 -0
- data/examples/dsl/dataflow/telegram.rb +45 -0
- data/examples/dsl/workflow/cherry_pie.dot +97 -0
- data/examples/dsl/workflow/cherry_pie.md +104 -0
- data/examples/dsl/workflow/cherry_pie.png +0 -0
- data/examples/dsl/workflow/cherry_pie.rb +101 -0
- data/examples/empty/.gitkeep +0 -0
- data/examples/examples_helper.rb +9 -0
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/implied_geolocation/README.md +63 -0
- data/examples/graph/minimum_spanning_tree/airfares_graphviz.rb +73 -0
- data/examples/improver/tweet_summary.rb +73 -0
- data/examples/loadable.rb +2 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/examples/munging/airline_flights/airplane.rb +0 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/indexable.rb +75 -0
- data/examples/munging/airline_flights/indexable_spec.rb +90 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +107 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +5 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links-cruft.rb +66 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +260 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/rake_helper.rb +97 -0
- data/examples/ruby_project/Gemfile +6 -0
- data/examples/ruby_project/README.md +6 -0
- data/examples/ruby_project/a/b/c/.gitkeep +0 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/examples/server_logs/server_logs-03-breadcrumbs-full.rb +71 -0
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/serverlogs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/serverlogs/models/logline.rb +102 -0
- data/examples/serverlogs/parser/apache_parser_widget.rb +46 -0
- data/examples/serverlogs/visit_paths/common.rb +4 -0
- data/examples/serverlogs/visit_paths/page_counts.pig +48 -0
- data/examples/serverlogs/visit_paths/serverlogs-01-parse-script.rb +11 -0
- data/examples/serverlogs/visit_paths/serverlogs-02-histograms-full.rb +31 -0
- data/examples/serverlogs/visit_paths/serverlogs-02-histograms-mapper.rb +12 -0
- data/examples/serverlogs/visit_paths/serverlogs-03-breadcrumbs-full.rb +67 -0
- data/examples/serverlogs/visit_paths/serverlogs-04-page_page_edges-full.rb +38 -0
- data/examples/splitter.rb +94 -0
- data/examples/string_reverser.rb +7 -0
- data/examples/text/pig_latin/pig_latinizer.rb +35 -0
- data/examples/text/pig_latin/pig_latinizer_widget.rb +16 -0
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/twitter.rb +5 -0
- data/lib/hanuman.rb +36 -0
- data/lib/hanuman/graph.rb +97 -0
- data/lib/hanuman/graphvizzer.rb +206 -0
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +128 -0
- data/lib/hanuman/tree.rb +67 -0
- data/lib/wu/geo.rb +4 -0
- data/lib/wu/geo/geo_grids.numbers +0 -0
- data/lib/wu/geo/geolocated.rb +331 -0
- data/lib/wu/geo/quadtile.rb +69 -0
- data/lib/wu/graph/union_find.rb +62 -0
- data/lib/wu/model/reconcilable.rb +63 -0
- data/lib/wu/munging.rb +71 -0
- data/lib/wu/social/models/twitter.rb +31 -0
- data/lib/wu/wikipedia/models.rb +20 -0
- data/lib/wukong.rb +54 -0
- data/lib/wukong/dataflow.rb +43 -0
- data/lib/wukong/doc_helpers.rb +14 -0
- data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
- data/lib/wukong/doc_helpers/field_handler.rb +91 -0
- data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
- data/lib/wukong/driver.rb +214 -0
- data/lib/wukong/driver/event_machine_driver.rb +15 -0
- data/lib/wukong/driver/wiring.rb +68 -0
- data/lib/wukong/local.rb +42 -0
- data/lib/wukong/local/runner.rb +96 -0
- data/lib/wukong/local/stdio_driver.rb +104 -0
- data/lib/wukong/logger.rb +102 -0
- data/lib/wukong/model/faker.rb +136 -0
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/plugin.rb +48 -0
- data/lib/wukong/processor.rb +110 -0
- data/lib/wukong/rake_helper.rb +6 -0
- data/lib/wukong/runner.rb +169 -0
- data/lib/wukong/runner/boot_sequence.rb +123 -0
- data/lib/wukong/runner/code_loader.rb +52 -0
- data/lib/wukong/runner/command_runner.rb +44 -0
- data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
- data/lib/wukong/runner/help_message.rb +42 -0
- data/lib/wukong/source.rb +33 -0
- data/lib/wukong/source/source_driver.rb +74 -0
- data/lib/wukong/source/source_runner.rb +38 -0
- data/lib/wukong/spec_helpers.rb +74 -0
- data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
- data/lib/wukong/spec_helpers/integration_tests/integration_test_matchers.rb +207 -0
- data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +22 -0
- data/lib/wukong/spec_helpers/unit_tests.rb +135 -0
- data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +132 -0
- data/lib/wukong/spec_helpers/unit_tests/unit_test_matchers.rb +169 -0
- data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +60 -0
- data/lib/wukong/version.rb +3 -0
- data/lib/wukong/widget/echo.rb +55 -0
- data/lib/wukong/widget/extract.rb +122 -0
- data/lib/wukong/widget/filters.rb +452 -0
- data/lib/wukong/widget/logger.rb +56 -0
- data/lib/wukong/widget/operators.rb +82 -0
- data/lib/wukong/widget/reducers.rb +10 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +368 -0
- data/lib/wukong/widget/reducers/count.rb +73 -0
- data/lib/wukong/widget/reducers/group.rb +128 -0
- data/lib/wukong/widget/reducers/group_concat.rb +98 -0
- data/lib/wukong/widget/reducers/improver.rb +71 -0
- data/lib/wukong/widget/reducers/join_xml.rb +37 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +180 -0
- data/lib/wukong/widget/reducers/uniq.rb +91 -0
- data/lib/wukong/widget/serializers.rb +317 -0
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +7 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parse_apache_logs_spec.rb +8 -0
- data/spec/examples/dataflow/parsing_spec.rb +14 -0
- data/spec/examples/dataflow/simple_spec.rb +34 -0
- data/spec/examples/dataflow/telegram_spec.rb +43 -0
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +34 -0
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +18 -0
- data/spec/examples/workflow/cherry_pie_spec.rb +36 -0
- data/spec/hanuman/graph_spec.rb +119 -0
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +81 -0
- data/spec/hanuman/tree_spec.rb +119 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +43 -0
- data/spec/support/example_test_helpers.rb +95 -0
- data/spec/support/hanuman_test_helpers.rb +92 -0
- data/spec/support/integration_helper.rb +38 -0
- data/spec/support/model_test_helpers.rb +115 -0
- data/spec/support/shared_context_for_graphs.rb +57 -0
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +94 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/wu/model/reconcilable_spec.rb +152 -0
- data/spec/wukong/dataflow_spec.rb +87 -0
- data/spec/wukong/driver_spec.rb +154 -0
- data/spec/wukong/local/runner_spec.rb +29 -0
- data/spec/wukong/local/stdio_driver_spec.rb +73 -0
- data/spec/wukong/local_spec.rb +6 -0
- data/spec/wukong/logger_spec.rb +49 -0
- data/spec/wukong/model/faker_spec.rb +132 -0
- data/spec/wukong/processor_spec.rb +21 -0
- data/spec/wukong/runner_spec.rb +132 -0
- data/spec/wukong/source_spec.rb +6 -0
- data/spec/wukong/widget/extract_spec.rb +101 -0
- data/spec/wukong/widget/filters_spec.rb +79 -0
- data/spec/wukong/widget/logger_spec.rb +23 -0
- data/spec/wukong/widget/operators_spec.rb +25 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +21 -0
- data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/reducers/uniq_spec.rb +14 -0
- data/spec/wukong/widget/serializers_spec.rb +114 -0
- data/spec/wukong/widget/sink_spec.rb +19 -0
- data/spec/wukong/widget/source_spec.rb +65 -0
- data/spec/wukong/wu-local_spec.rb +109 -0
- data/spec/wukong/wu-source_spec.rb +32 -0
- data/spec/wukong/wu_spec.rb +14 -0
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +35 -0
- metadata +465 -0
@@ -0,0 +1,452 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
# Taken from Yoichiro Hasebe's [`wp2txt` project](https://github.com/yohasebe/wp2txt)
|
5
|
+
# with liberal modifications for our purposes.
|
6
|
+
#
|
7
|
+
# This software is distributed under the MIT License. Please see the `./wp2txt-LICENSE.txt` file.
|
8
|
+
|
9
|
+
require 'strscan'
|
10
|
+
require 'find'
|
11
|
+
require 'sanitize'
|
12
|
+
|
13
|
+
module Wp2txt
|
14
|
+
|
15
|
+
# Runs a piece of wiki markup through the clean-up steps defined in this
# module and returns the resulting text.
#
# Steps, in order: numeric character references, <nowiki> stashing, interwiki
# links, external links, directives, emphasis quotes, dash templates,
# horizontal rules, special characters, and finally <nowiki> restoration.
#
# If any step raises a StandardError the text is re-encoded once (dropping
# bytes that cannot be transcoded) and the pipeline is retried. If the retry
# fails as well, the original text is written to "error_log.txt" and the
# process exits.
#
# @param original_text [String] raw wiki markup; a copy is processed
# @param has_retried [Boolean] true when this call is already the retry
# @return [String] the cleaned text
def format_wiki(original_text, has_retried = false)
  text = original_text.dup

  text = chrref_to_utf(text)
  text = escape_nowiki(text)

  text = process_interwiki_links(text)
  text = process_external_links(text)

  text = remove_directive(text)
  text = remove_emphasis(text)

  text = mndash(text)

  text = remove_hr(text)

  text = special_chr(text)

  # Must run last: swaps the placeholders left by escape_nowiki back for the
  # text they stand for.
  unescape_nowiki(text)
rescue # detect invalid byte sequence in UTF-8
  if has_retried
    puts "invalid byte sequence detected"
    puts "******************************"
    File.open("error_log.txt", "w") do |f|
      f.write original_text
    end
    exit
  else
    # Round-trip through UTF-16 to drop the bytes that cannot be transcoded
    fixed_text = original_text.encode("UTF-16", :invalid => :replace, :replace => '').encode("UTF-8")
    format_wiki(fixed_text, true)
  end
end
|
51
|
+
|
52
|
+
#################### parser for nested structure ####################
|
53
|
+
|
54
|
+
# Rewrites delimited spans of the scanner's string, innermost span first.
#
# Each pass scans for the first `right` delimiter that follows a `left`
# delimiter, hands the text between that `right` and the nearest preceding
# `left` to the block, and substitutes the block's return value for the whole
# span (delimiters included). Passes repeat until one leaves the string
# unchanged.
#
# Implemented as a loop rather than one recursive call per substitution, so
# the number of spans is not limited by the call stack.
#
# @param scanner [StringScanner] scanner over the text; its string is
#   replaced with the intermediate result after every pass
# @param left [String] opening delimiter
# @param right [String] closing delimiter
# @yieldparam contents [String] text between an innermost delimiter pair
# @yieldreturn [String] replacement for the pair and its contents
# @return [String] the fully rewritten text
def process_nested_structure(scanner, left, right, &block)
  delimiter = /(#{Regexp.escape(left)}|#{Regexp.escape(right)})/m

  loop do
    buffer = String.new
    has_left = false

    while (str = scanner.scan_until(delimiter))
      if scanner[1] == left
        buffer << str
        has_left = true
      elsif has_left
        # buffer ends with the opening delimiter, str with the closing one
        buffer = buffer[0...-(left.size)]
        buffer << block.call(str[0...-(right.size)])
        break
      else
        # closing delimiter with no opener before it: keep it verbatim
        buffer << str
      end
    end
    buffer << scanner.rest

    return scanner.string if buffer == scanner.string

    # Assigning the string also rewinds the scanner for the next pass
    scanner.string = buffer
  end
end
|
82
|
+
|
83
|
+
# Deletes every {{...}} template from the text, nested ones included.
#
# If processing overflows the stack, the record is reported through
# Wukong.bad_record and everything from the first "{{" to the last "}}" is
# replaced by a marker instead. The input string is never modified.
#
# @param str [String] wiki markup
# @param only_not_inline [Boolean] accepted for compatibility; not used
# @return [String] the text without templates
def remove_templates(str, only_not_inline = true)
  scanner = StringScanner.new(str)
  # Every template, whatever its contents, is replaced by nothing
  process_nested_structure(scanner, "{{", "}}") { |_contents| '' }
rescue SystemStackError => err
  Wukong.bad_record("Poorly nested templates", err, str)
  str.gsub(/\{\{.*\}\}/m, "**BAD TEMPLATE**")
end
|
97
|
+
|
98
|
+
|
99
|
+
#################### methods used from format_wiki ####################
|
100
|
+
|
101
|
+
# Replaces each <nowiki>...</nowiki> section with a placeholder tag and keeps
# the section's text in @nowikis, keyed by the id embedded in the placeholder.
# Any entries stashed by a previous call are discarded first.
#
# @param str [String] wiki markup
# @return [String] a copy of str with the sections replaced by placeholders
def escape_nowiki(str)
  (@nowikis ||= {}).clear

  str.gsub(/<nowiki>(.*?)<\/nowiki>/m) do
    stashed = Regexp.last_match(1)
    key = stashed.object_id
    @nowikis[key] = stashed
    "<nowiki nowikiid=\"#{key}\">"
  end
end
|
114
|
+
|
115
|
+
# Replaces each <nowiki nowikiid="N"> placeholder with the text stored under
# N in @nowikis.
#
# @param str [String] text containing placeholders
# @return [String] a copy of str with the placeholders substituted
def unescape_nowiki(str)
  str.gsub(/<nowiki nowikiid=\"(\d+)\">/) { @nowikis[Regexp.last_match(1).to_i] }
end
|
121
|
+
|
122
|
+
# Replaces [[...]] links with plain text.
#
# A link without a "|" becomes its contents; otherwise the part before the
# first "|" is dropped and the rest is kept.
#
# If processing overflows the stack, the record is reported through
# Wukong.bad_record and everything from the first "[[" to the last "]]" is
# replaced by a marker instead. The input string is never modified.
#
# @param str [String] wiki markup
# @return [String] the text with links flattened
def process_interwiki_links(str)
  scanner = StringScanner.new(str)
  process_nested_structure(scanner, "[[", "]]") do |contents|
    parts = contents.split("|")
    # drop(1) on an empty list gives [], which joins to ""
    parts.size == 1 ? parts.first : parts.drop(1).join("|")
  end
rescue SystemStackError => err
  Wukong.bad_record("Poorly nested internal links", err, str)
  str.gsub(/\[\[.*\]\]/m, "**BAD INTERWIKI LINKS**")
end
|
140
|
+
|
141
|
+
# Replaces [...] external links with plain text.
#
# The contents are split at the first run of whitespace: when there are two
# parts the second one is kept, when there is only one it is kept as is.
#
# If processing overflows the stack, the record is reported through
# Wukong.bad_record and everything from the first "[" to the last "]" is
# replaced by a marker instead. The input string is never modified.
#
# @param str [String] wiki markup
# @return [String] the text with external links flattened
def process_external_links(str)
  scanner = StringScanner.new(str)
  process_nested_structure(scanner, "[", "]") do |contents|
    # split yields at most two parts; nil only when the brackets were empty
    contents.split(" ", 2).last || ""
  end
rescue SystemStackError => err
  Wukong.bad_record("Poorly nested external links", err, str)
  str.gsub(/\[.*\]/m, "**BAD EXTERNAL LINKS**")
end
|
157
|
+
|
158
|
+
# Replaces named HTML character entities (&amp;, &eacute;, &rarr;, ...) with
# the characters they stand for.
#
# The lookup table and the matching regexp are built on first use and kept in
# @sp_hash / @sp_regex. The text is scanned once, so the output of one
# replacement is never decoded again ("&amp;lt;" becomes "&lt;").
#
# @param str [String] text to convert; it is modified in place
# @return [String] the same string object
def special_chr(str)
  unless @sp_hash
    @sp_hash = {
      # html
      '&nbsp;' => ' ', '&lt;' => '<', '&gt;' => '>', '&amp;' => '&', '&quot;' => '"',
      # umlauts and accents
      '&Agrave;' => 'À', '&Aacute;' => 'Á', '&Acirc;' => 'Â', '&Atilde;' => 'Ã', '&Auml;' => 'Ä',
      '&Aring;' => 'Å', '&AElig;' => 'Æ', '&Ccedil;' => 'Ç', '&Egrave;' => 'È', '&Eacute;' => 'É',
      '&Ecirc;' => 'Ê', '&Euml;' => 'Ë', '&Igrave;' => 'Ì', '&Iacute;' => 'Í', '&Icirc;' => 'Î',
      '&Iuml;' => 'Ï', '&Ntilde;' => 'Ñ', '&Ograve;' => 'Ò', '&Oacute;' => 'Ó', '&Ocirc;' => 'Ô',
      '&Otilde;' => 'Õ', '&Ouml;' => 'Ö', '&Oslash;' => 'Ø', '&Ugrave;' => 'Ù', '&Uacute;' => 'Ú',
      '&Ucirc;' => 'Û', '&Uuml;' => 'Ü', '&szlig;' => 'ß', '&agrave;' => 'à', '&aacute;' => 'á',
      '&acirc;' => 'â', '&atilde;' => 'ã', '&auml;' => 'ä', '&aring;' => 'å', '&aelig;' => 'æ',
      '&ccedil;' => 'ç', '&egrave;' => 'è', '&eacute;' => 'é', '&ecirc;' => 'ê', '&euml;' => 'ë',
      '&igrave;' => 'ì', '&iacute;' => 'í', '&icirc;' => 'î', '&iuml;' => 'ï', '&ntilde;' => 'ñ',
      '&ograve;' => 'ò', '&oacute;' => 'ó', '&ocirc;' => 'ô', '&oelig;' => 'œ', '&otilde;' => 'õ',
      '&ouml;' => 'ö', '&oslash;' => 'ø', '&ugrave;' => 'ù', '&uacute;' => 'ú', '&ucirc;' => 'û',
      '&uuml;' => 'ü', '&yuml;' => 'ÿ',
      # punctuation
      '&iquest;' => '¿', '&iexcl;' => '¡', '&laquo;' => '«', '&raquo;' => '»', '&sect;' => '§',
      '&para;' => '¶', '&dagger;' => '†', '&Dagger;' => '‡', '&bull;' => '•', '&ndash;' => '–',
      '&mdash;' => '—',
      # commercial
      '&trade;' => '™', '&copy;' => '©', '&reg;' => '®', '&cent;' => '¢', '&euro;' => '€',
      '&yen;' => '¥', '&pound;' => '£', '&curren;' => '¤',
      # greek
      '&alpha;' => 'α', '&beta;' => 'β', '&gamma;' => 'γ', '&delta;' => 'δ', '&epsilon;' => 'ε',
      '&zeta;' => 'ζ', '&eta;' => 'η', '&theta;' => 'θ', '&iota;' => 'ι', '&kappa;' => 'κ',
      '&lambda;' => 'λ', '&mu;' => 'μ', '&nu;' => 'ν', '&xi;' => 'ξ', '&omicron;' => 'ο',
      '&pi;' => 'π', '&rho;' => 'ρ', '&sigma;' => 'σ', '&sigmaf;' => 'ς', '&tau;' => 'τ',
      '&upsilon;' => 'υ', '&phi;' => 'φ', '&chi;' => 'χ', '&psi;' => 'ψ', '&omega;' => 'ω',
      '&Gamma;' => 'Γ', '&Delta;' => 'Δ', '&Theta;' => 'Θ', '&Lambda;' => 'Λ', '&Xi;' => 'Ξ',
      '&Pi;' => 'Π', '&Sigma;' => 'Σ', '&Phi;' => 'Φ', '&Psi;' => 'Ψ', '&Omega;' => 'Ω',
      # math
      '&int;' => '∫', '&sum;' => '∑', '&prod;' => '∏', '&radic;' => '√', '&minus;' => '−',
      '&plusmn;' => '±', '&infin;' => '∞', '&asymp;' => '≈', '&prop;' => '∝', '&equiv;' => '≡',
      '&ne;' => '≠', '&le;' => '≤', '&ge;' => '≥', '&times;' => '×', '&middot;' => '·',
      '&divide;' => '÷', '&part;' => '∂', '&prime;' => '′', '&Prime;' => '″', '&nabla;' => '∇',
      '&permil;' => '‰', '&deg;' => '°', '&there4;' => '∴', '&isin;' => '∈', '&cap;' => '∩',
      '&cup;' => '∪', '&sub;' => '⊂', '&sup;' => '⊃', '&sube;' => '⊆', '&supe;' => '⊇',
      '&not;' => '¬', '&and;' => '∧', '&or;' => '∨', '&exist;' => '∃', '&forall;' => '∀',
      '&rArr;' => '⇒', '&hArr;' => '⇔', '&rarr;' => '→', '&harr;' => '↔', '&uarr;' => '↑',
      '&alefsym;' => 'ℵ', '&notin;' => '∉',
      # others
      '&uml;' => '¨', '&ordf;' => 'ª', '&macr;' => '¯', '&acute;' => '´', '&micro;' => 'µ',
      '&cedil;' => '¸', '&ordm;' => 'º', '&lsquo;' => '‘', '&rsquo;' => '’', '&ldquo;' => '“',
      '&sbquo;' => '‚', '&rdquo;' => '”', '&bdquo;' => '„', '&spades;' => '♠', '&clubs;' => '♣',
      '&loz;' => '◊', '&hearts;' => '♥', '&larr;' => '←', '&diams;' => '♦', '&lsaquo;' => '‹',
      '&rsaquo;' => '›', '&darr;' => '↓'
    }
    # Regexp.union escapes each key; the group keeps the match in capture 1
    @sp_regex = /(#{Regexp.union(@sp_hash.keys)})/
  end

  str.gsub!(@sp_regex) { @sp_hash[Regexp.last_match(1)] }
  str
end
|
226
|
+
|
227
|
+
# Deletes delimited tags from the text.
#
# With the default ['<', '>'] the work is handed to remove_html_tag. For any
# other pair, every span that starts with the first delimiter, ends with the
# second, and contains none of the delimiters' characters in between is
# removed.
#
# @param str [String] text to clean
# @param tagset [Array<String>] opening and closing delimiter
# @return [String] the text without the tags
def remove_tag(str, tagset = ['<', '>'])
  return remove_html_tag(str) if tagset == ['<', '>']

  excluded = Regexp.quote(tagset.uniq.join(""))
  opening  = Regexp.escape(tagset[0])
  closing  = Regexp.escape(tagset[1])
  str.gsub(/#{opening}[^#{excluded}]*#{closing}/, "")
end
|
237
|
+
|
238
|
+
# Passes the text through Sanitize.clean with its default settings and
# returns the result.
#
# @param str [String] text that may contain HTML
# @return [String] the value returned by Sanitize.clean
def remove_html_tag(str)
  ::Sanitize.clean(str)
end
|
241
|
+
|
242
|
+
# Strips attributes from HTML tags, then passes the text through
# Sanitize.clean with the contents of <ref> elements removed.
#
# The argument is left untouched; attribute stripping happens on a copy.
#
# @param text [String] text that may contain HTML
# @return [String] the value returned by Sanitize.clean
def clean_html(text)
  # <tag attr="x"> -> <tag>, <tag attr="x"/> -> <tag/>
  bare_tags = text.gsub(%r{<(\w+)\s[^>]*?(/?)>}, '<\1\2>')
  ::Sanitize.clean(bare_tags, remove_contents: ['ref'])
end
|
246
|
+
|
247
|
+
# Removes wiki emphasis markers: text wrapped in a matching run of two or
# more single quotes on each side is replaced by the text alone.
#
# @param str [String] wiki markup
# @return [String] a copy of str without the quote runs
def remove_emphasis(str)
  str.gsub(/(''+)(.+?)\1/, '\2')
end
|
252
|
+
|
253
|
+
# Replaces numeric character references (&#233; or &#xE9;) with the
# characters they denote.
#
# References that do not name a valid code point (surrogates, values above
# U+10FFFF) are left as written. If the conversion raises, for instance
# because the input contains bytes that are invalid in its encoding, the
# input is returned unchanged.
#
# @param num_str [String] text that may contain numeric references
# @return [String] the text with valid references decoded
def chrref_to_utf(num_str)
  num_str.gsub(/&#(x?)([0-9a-fA-F]+);/) do |reference|
    hex_marker = Regexp.last_match(1)
    digits     = Regexp.last_match(2)
    code_point = hex_marker == 'x' ? digits.to_i(16) : digits.to_i

    if code_point > 0x10FFFF || (0xD800..0xDFFF).cover?(code_point)
      reference
    else
      # 'U' packs the integer as a UTF-8 encoded character
      [code_point].pack('U')
    end
  end
rescue StandardError
  num_str
end
|
271
|
+
|
272
|
+
# Removes double-underscore directives (spans of the form __...__) by calling
# remove_tag with '__' as both delimiters.
#
# @param str [String] wiki markup
# @return [String] the text without the directives
def remove_directive(str)
  remove_tag(str, %w[__ __])
end
|
275
|
+
|
276
|
+
# Replaces the single-brace dash forms {mdash}, {ndash} and {–} with an
# en dash.
#
# @param str [String] wiki markup
# @return [String] a copy of str with the replacements applied
def mndash(str)
  str.gsub(/\{(mdash|ndash|–)\}/, "–")
end
|
279
|
+
|
280
|
+
# Blanks out lines that consist only of hyphens and optional surrounding
# whitespace (wiki horizontal rules).
#
# @param page [String] wiki markup
# @return [String] a copy of page with those lines emptied
def remove_hr(page)
  page.gsub(/^\s*\-+\s*$/, "")
end
|
283
|
+
|
284
|
+
# Turns <br/> tags into newlines and deletes <ref> elements, both the
# self-closing form and the paired form with its contents.
#
# @param str [String] wiki markup; it is modified in place
# @return [String] the same string object
def make_reference(str)
  substitutions = [
    [%r{<br ?\/>}m, "\n"],
    [%r{<ref[^>]*\/>}m, ''],
    [%r{<ref[^>]*>.*?<\/ref>}m, '']
  ]
  substitutions.each { |pattern, replacement| str.gsub!(pattern, replacement) }
  str
end
|
290
|
+
|
291
|
+
# Unwraps [ref]...[/ref] sections: the markers are dropped, and inside the
# section every run of line breaks and every <br/> tag becomes one space.
#
# @param page [String] text containing [ref] sections
# @return [String] a copy of page with the sections flattened
def format_ref(page)
  page.gsub(/\[ref\](.*?)\[\/ref\]/m) do
    Regexp.last_match(1).gsub(/(?:[\r\n]+|<br ?\/>)/, " ")
  end
end
|
297
|
+
|
298
|
+
#################### methods currently unused ####################
|
299
|
+
|
300
|
+
# Replaces {{...}} templates with one of their "|"-separated fields.
#
# An empty template becomes "". A template with one field becomes that
# field. Otherwise the first field is used when the last field looks like a
# key=value pair, and the last field is used when it does not.
#
# If processing overflows the stack, the record is reported through
# Wukong.bad_record and everything from the first "{{" to the last "}}" is
# replaced by a marker instead. The input string is never modified.
#
# @param str [String] wiki markup
# @return [String] the text with templates flattened
def process_template(str)
  scanner = StringScanner.new(str)
  process_nested_structure(scanner, "{{", "}}") do |contents|
    parts = contents.split("|")
    if parts.empty?
      ""
    elsif parts.size == 1 || parts.last.split("=").size > 1
      parts.first
    else
      parts.last
    end
  end
rescue SystemStackError => err
  Wukong.bad_record("Poorly nested templates", err, str)
  str.gsub(/\{\{.*\}\}/m, "**BAD TEMPLATES**")
end
|
322
|
+
|
323
|
+
# Deletes {| ... |} tables, nested ones included.
#
# Each pass removes the tables that contain no "{", "|" or "}" between their
# delimiters; passes repeat until nothing changes.
#
# @param str [String] wiki markup
# @return [String] a copy of str without the tables
def remove_table(str)
  current = str
  loop do
    stripped = current.gsub(/\{\|[^\{\|\}]*?\|\}/m, "")
    return stripped if stripped == current

    current = stripped
  end
end
|
331
|
+
|
332
|
+
# Deletes {{clade ...}} / {{Clade ...}} templates, nested ones included, by
# repeatedly removing the ones without inner braces until nothing changes.
#
# @param page [String] wiki markup
# @return [String] a copy of page without the clade templates
def remove_clade(page)
  current = page
  loop do
    stripped = current.gsub(/\{\{(?:C|c)lade[^\{\}]*\}\}/m, "")
    return stripped if stripped == current

    current = stripped
  end
end
|
337
|
+
|
338
|
+
# Rewrites single-line {{...}} templates.
#
# A template without "|" becomes its contents. For the others the first
# field is checked against a list of prefixes: on a match the template
# becomes its last field, otherwise it is kept with single braces and its
# fields chomped.
#
# @param str [String] wiki markup
# @return [String] a copy of str with the templates rewritten
def remove_inline_template(str)
  unwrapped_types = [
    /\Alang*/i, /\AIPA/i, /\AIEP/i, /\ASEP/i, /\Aindent/i, /\Aaudio/i, /\Asmall/i,
    /\Admoz/i, /\Apron/i, /\Aunicode/i, /\Anote label/i, /\Anowrap/i,
    /\AArabDIN/i, /\Atrans/i, /\ANihongo/i, /\APolytonic/i
  ]

  str.gsub(/\{\{(.*?)\}\}/) do
    inner = Regexp.last_match(1)
    next inner if /\A[^\|]+\z/ =~ inner

    fields = inner.split("|")
    if unwrapped_types.any? { |pattern| pattern === fields.first }
      fields[-1]
    else
      "{" + fields.map { |field| field.chomp }.join("|") + "}"
    end
  end
end
|
358
|
+
|
359
|
+
#################### file related utilities ####################
|
360
|
+
|
361
|
+
# Walks the tree under the given path with Find.find and returns, sorted,
# every visited path that matches the regexp.
#
# @param str [String] starting path
# @param regex [Regexp, nil] filter; nil matches everything
# @return [Array<String>] the matching paths in sorted order
def collect_files(str, regex = nil)
  filter = regex || //
  Find.find(str).select { |path| filter =~ path }.sort
end
|
370
|
+
|
371
|
+
# Rewrites a file with the result of the given block.
#
# The block receives the file's contents; its return value becomes the new
# contents (a nil return keeps the contents as they were). The new contents
# are staged in "<file_path>.tmp" next to the target and then renamed over
# it, so nothing is written to the current working directory.
#
# @param file_path [String] file to rewrite
# @param backup [Boolean] when true the previous version is kept as
#   "<file_path>.bak"
# @yieldparam contents [String] current contents of the file
# @yieldreturn [String, nil] new contents, or nil for no change
def file_mod(file_path, backup = false, &block)
  contents = File.read(file_path)
  replacement = yield(contents)
  contents = replacement unless replacement.nil?

  staging_path = file_path + ".tmp"
  backup_path  = file_path + ".bak"
  File.open(staging_path, "w") { |staged| staged.write(contents) }

  File.rename(file_path, backup_path)
  File.rename(staging_path, file_path)
  File.unlink(backup_path) unless backup
end
|
386
|
+
|
387
|
+
# Yields files to the block: every regular file found under a directory
# (via collect_files), or the path itself when it is a regular file rather
# than a directory.
#
# @param dir_path [String] directory or file path
# @yieldparam file [String] path of a regular file
def batch_file_mod(dir_path, &block)
  if FileTest.directory?(dir_path)
    return collect_files(dir_path).each { |path| yield path if FileTest.file?(path) }
  end

  yield dir_path if FileTest.file?(dir_path)
end
|
397
|
+
|
398
|
+
# Normalises path separators for the platform Ruby is running on: "/"
# becomes "\" when RUBY_PLATFORM names a Windows build (mswin, mingw or
# win32), and "\" becomes "/" everywhere else.
#
# @param input [String, Array] a path, or an (optionally nested) array of
#   paths
# @return [String, Array, nil] the converted copy with the same shape as the
#   input; nil when the input is neither a String nor an Array
def correct_separator(input)
  case input
  when String
    if RUBY_PLATFORM =~ /mswin|mingw|win32/
      input.gsub("/", "\\")
    else
      input.gsub("\\", "/")
    end
  when Array
    input.map { |item| correct_separator(item) }
  end
end
|
416
|
+
|
417
|
+
# Renames every given file by appending ".txt". Files whose name ends in
# "-<digits>" additionally have the number zero-padded to the width of the
# longest such number in the list.
#
# @param files [Array<String>] paths of the files to rename
def rename(files)
  numbered = /\-(\d+)\z/

  # number of digits needed for the longest numeric suffix (0 when none)
  maxwidth = files.map { |f| f.slice(numbered, 1).to_s.length }.max.to_i

  files.each do |f|
    padded = f.sub(numbered) { "-" + sprintf("%0#{maxwidth}d", Regexp.last_match(1).to_i) }
    File.rename(f, padded + ".txt")
  end
end
|
433
|
+
|
434
|
+
# Formats a number of seconds as "HH:MM:SS".
#
# @param int [Integer, nil] seconds; nil or false gives the placeholder
# @return [String] the formatted duration, or "--:--:--" without a value
def sec_to_str(int)
  return "--:--:--" unless int

  hours   = int / 3600
  minutes = (int - hours * 3600) / 60
  format("%02d:%02d:%02d", hours, minutes, int % 60)
end
|
446
|
+
|
447
|
+
# Formats a number with a comma between each group of three integer digits.
#
# Only the leading run of digits is grouped, so a minus sign stays attached
# to the number and a fractional part is left as it is.
#
# @param i [Numeric, #to_s] value to format
# @return [String] e.g. "1,234,567", "-1,234" or "1,234.5"
def decimal_format(i)
  i.to_s.sub(/\A(-?)(\d+)/) do
    sign   = Regexp.last_match(1)
    digits = Regexp.last_match(2)
    # group from the right: reverse, cut into threes, reverse back
    sign + digits.reverse.scan(/\d{1,3}/).join(',').reverse
  end
end
|
451
|
+
|
452
|
+
end
|