wp2txt 0.5.1 → 0.5.02
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +19 -14
- data/lib/wp2txt/article.rb +2 -3
- data/lib/wp2txt/version.rb +1 -1
- metadata +2 -2
data/README.md
CHANGED
@@ -30,20 +30,25 @@ Command line options are as follows:
|
|
30
30
|
|
31
31
|
Usage: wp2txt [options]
|
32
32
|
where [options] are:
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
33
|
+
--input-file, -i: Wikipedia dump file with .bz2 (compressed) or .txt
|
34
|
+
(uncompressed) format
|
35
|
+
--output-dir, -o <s>: Output directory (default:
|
36
|
+
/Users/yohasebe/Dropbox/code/wp2txt)
|
37
|
+
--convert-off, -c: Output XML (without converting to plain text)
|
38
|
+
--list-off, -l: Exclude list items from output
|
39
|
+
--heading-off, -d: Exclude section titles from output
|
40
|
+
--title-off, -t: Exclude page titles from output
|
41
|
+
--table-off, --no-table-off, -a: Exclude tables from output (default: true)
|
42
|
+
--template-off, --no-template-off, -e: Remove template notations from output (default:
|
43
|
+
true)
|
44
|
+
--redirect-off, -r: Do not show redirect destination
|
45
|
+
--strip-marker, -s: Remove symbols prefixed to list items, definitions,
|
46
|
+
etc.
|
47
|
+
--category-off, -g: Do not show article category information
|
48
|
+
--file-size, -f <i>: Approximate size (in MB) of each output file
|
49
|
+
(default: 10)
|
50
|
+
--version, -v: Print version and exit
|
51
|
+
--help, -h: Show this message
|
47
52
|
|
48
53
|
### Limitations ###
|
49
54
|
|
data/lib/wp2txt/article.rb
CHANGED
@@ -70,9 +70,8 @@ module Wp2txt
|
|
70
70
|
@@blank_line_regex = Regexp.new('^\s*$')
|
71
71
|
|
72
72
|
@@redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
|
73
|
-
|
74
|
-
|
75
|
-
@@category_regex = Regexp.new('[\{\[\|\b](?:' + category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
|
73
|
+
|
74
|
+
@@category_regex = Regexp.new('[\{\[\|\b](?:C|c)ategory\:(.*?)[\}\]\|\b]')
|
76
75
|
|
77
76
|
def initialize(text, title = "", strip_tmarker = false)
|
78
77
|
@title = title.strip
|
data/lib/wp2txt/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.1
|
4
|
+
version: 0.5.02
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-01-
|
12
|
+
date: 2013-01-14 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|