wp2txt 0.5.0 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +14 -15
- data/bin/wp2txt +1 -1
- data/lib/wp2txt/article.rb +3 -2
- data/lib/wp2txt/version.rb +1 -1
- metadata +2 -2
data/README.md
CHANGED
@@ -16,9 +16,6 @@ WP2TXT before version 0.4.0 came with Mac/Windows GUI. Now it's become a pure co
|
|
16
16
|
|
17
17
|
### Installation
|
18
18
|
|
19
|
-
$ gem install bundler
|
20
|
-
$ bundle install
|
21
|
-
|
22
19
|
$ gem install wp2txt
|
23
20
|
|
24
21
|
### Usage
|
@@ -33,18 +30,20 @@ Command line options are as follows:
|
|
33
30
|
|
34
31
|
Usage: wp2txt [options]
|
35
32
|
where [options] are:
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
33
|
+
--input-file, -i: Wikipedia dump file with .bz2 (compressed) or .txt (uncompressed) format
|
34
|
+
--output-dir, -o <s>: Output directory (default: Present working directory)
|
35
|
+
--convert-off, -c: Output XML (without converting to plain text)
|
36
|
+
--list-off, -l: Exclude list items from output
|
37
|
+
--heading-off, -d: Exclude section titles from output
|
38
|
+
--title-off, -t: Exclude page titles from output
|
39
|
+
--table-off, --no-table-off, -a: Exclude tables from output (default: true)
|
40
|
+
--template-off, --no-template-off, -e: Remove template notations from output (default: true)
|
41
|
+
--redirect-off, -r: Not show redirect destination
|
42
|
+
--strip-marker, -s: Remove symbols prefixed to list items, definitions, etc.
|
43
|
+
--category-off, -g: Not show article category information
|
44
|
+
--file-size, -f <i>: Approximate size (in MB) of each output file (default: 10)
|
45
|
+
--version, -v: Print version and exit
|
46
|
+
--help, -h: Show this message
|
48
47
|
|
49
48
|
### Limitations ###
|
50
49
|
|
data/bin/wp2txt
CHANGED
@@ -34,7 +34,7 @@ EOS
|
|
34
34
|
opt :template_off, "Remove template notations from output", :default => true
|
35
35
|
opt :redirect_off, "Not show redirect destination", :default => false
|
36
36
|
opt :strip_marker, "Remove symbols prefixed to list items, definitions, etc.", :default => false
|
37
|
-
opt :category_off, "Not
|
37
|
+
opt :category_off, "Not show article category information", :default => false
|
38
38
|
opt :file_size, "Approximate size (in MB) of each output file", :default => 10
|
39
39
|
end
|
40
40
|
Trollop::die :size, "must be larger than 0" unless opts[:file_size] >= 0
|
data/lib/wp2txt/article.rb
CHANGED
@@ -70,8 +70,9 @@ module Wp2txt
|
|
70
70
|
@@blank_line_regex = Regexp.new('^\s*$')
|
71
71
|
|
72
72
|
@@redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
|
73
|
-
|
74
|
-
|
73
|
+
|
74
|
+
category_patterns = ["Category", "Categoria"].join("|")
|
75
|
+
@@category_regex = Regexp.new('[\{\[\|\b](?:' + category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
|
75
76
|
|
76
77
|
def initialize(text, title = "", strip_tmarker = false)
|
77
78
|
@title = title.strip
|
data/lib/wp2txt/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.0
|
4
|
+
version: 0.5.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-01-
|
12
|
+
date: 2013-01-16 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|