wp2txt 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +14 -15
- data/bin/wp2txt +1 -1
- data/lib/wp2txt/article.rb +3 -2
- data/lib/wp2txt/version.rb +1 -1
- metadata +2 -2
data/README.md
CHANGED
@@ -16,9 +16,6 @@ WP2TXT before version 0.4.0 came with Mac/Windows GUI. Now it's become a pure co
|
|
16
16
|
|
17
17
|
### Installation
|
18
18
|
|
19
|
-
$ gem install bundler
|
20
|
-
$ bundle install
|
21
|
-
|
22
19
|
$ gem install wp2txt
|
23
20
|
|
24
21
|
### Usage
|
@@ -33,18 +30,20 @@ Command line options are as follows:
|
|
33
30
|
|
34
31
|
Usage: wp2txt [options]
|
35
32
|
where [options] are:
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
33
|
+
--input-file, -i: Wikipedia dump file with .bz2 (compressed) or .txt (uncompressed) format
|
34
|
+
--output-dir, -o <s>: Output directory (default: Present working directory)
|
35
|
+
--convert-off, -c: Output XML (without converting to plain text)
|
36
|
+
--list-off, -l: Exclude list items from output
|
37
|
+
--heading-off, -d: Exclude section titles from output
|
38
|
+
--title-off, -t: Exclude page titles from output
|
39
|
+
--table-off, --no-table-off, -a: Exclude tables from output (default: true)
|
40
|
+
--template-off, --no-template-off, -e: Remove template notations from output (default: true)
|
41
|
+
--redirect-off, -r: Not show redirect destination
|
42
|
+
--strip-marker, -s: Remove symbols prefixed to list items, definitions, etc.
|
43
|
+
--category-off, -g: Not show article category information
|
44
|
+
--file-size, -f <i>: Approximate size (in MB) of each output file (default: 10)
|
45
|
+
--version, -v: Print version and exit
|
46
|
+
--help, -h: Show this message
|
48
47
|
|
49
48
|
### Limitations ###
|
50
49
|
|
data/bin/wp2txt
CHANGED
@@ -34,7 +34,7 @@ EOS
|
|
34
34
|
opt :template_off, "Remove template notations from output", :default => true
|
35
35
|
opt :redirect_off, "Not show redirect destination", :default => false
|
36
36
|
opt :strip_marker, "Remove symbols prefixed to list items, definitions, etc.", :default => false
|
37
|
-
opt :category_off, "Not
|
37
|
+
opt :category_off, "Not show article category information", :default => false
|
38
38
|
opt :file_size, "Approximate size (in MB) of each output file", :default => 10
|
39
39
|
end
|
40
40
|
Trollop::die :size, "must be larger than 0" unless opts[:file_size] >= 0
|
data/lib/wp2txt/article.rb
CHANGED
@@ -70,8 +70,9 @@ module Wp2txt
|
|
70
70
|
@@blank_line_regex = Regexp.new('^\s*$')
|
71
71
|
|
72
72
|
@@redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
|
73
|
-
|
74
|
-
|
73
|
+
|
74
|
+
category_patterns = ["Category", "Categoria"].join("|")
|
75
|
+
@@category_regex = Regexp.new('[\{\[\|\b](?:' + category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
|
75
76
|
|
76
77
|
def initialize(text, title = "", strip_tmarker = false)
|
77
78
|
@title = title.strip
|
data/lib/wp2txt/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.
|
4
|
+
version: 0.5.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-01-
|
12
|
+
date: 2013-01-16 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|