wp2txt 0.5.1 → 0.5.02

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -30,20 +30,25 @@ Command line options are as follows:
30
30
 
31
31
  Usage: wp2txt [options]
32
32
  where [options] are:
33
- --input-file, -i: Wikipedia dump file with .bz2 (compressed) or .txt (uncompressed) format
34
- --output-dir, -o <s>: Output directory (default: Present working directory)
35
- --convert-off, -c: Output XML (without converting to plain text)
36
- --list-off, -l: Exclude list items from output
37
- --heading-off, -d: Exclude section titles from output
38
- --title-off, -t: Exclude page titles from output
39
- --table-off, --no-table-off, -a: Exclude tables from output (default: true)
40
- --template-off, --no-template-off, -e: Remove template notations from output (default: true)
41
- --redirect-off, -r: Not show redirect destination
42
- --strip-marker, -s: Remove symbols prefixed to list items, definitions, etc.
43
- --category-off, -g: Not show article category information
44
- --file-size, -f <i>: Approximate size (in MB) of each output file (default: 10)
45
- --version, -v: Print version and exit
46
- --help, -h: Show this message
33
+ --input-file, -i: Wikipedia dump file with .bz2 (compressed) or .txt
34
+ (uncompressed) format
35
+ --output-dir, -o <s>: Output directory (default:
36
+ present working directory)
37
+ --convert-off, -c: Output XML (without converting to plain text)
38
+ --list-off, -l: Exclude list items from output
39
+ --heading-off, -d: Exclude section titles from output
40
+ --title-off, -t: Exclude page titles from output
41
+ --table-off, --no-table-off, -a: Exclude tables from output (default: true)
42
+ --template-off, --no-template-off, -e: Remove template notations from output (default:
43
+ true)
44
+ --redirect-off, -r: Do not show redirect destination
45
+ --strip-marker, -s: Remove symbols prefixed to list items, definitions,
46
+ etc.
47
+ --category-off, -g: Do not show article category information
48
+ --file-size, -f <i>: Approximate size (in MB) of each output file
49
+ (default: 10)
50
+ --version, -v: Print version and exit
51
+ --help, -h: Show this message
47
52
 
48
53
  ### Limitations ###
49
54
 
@@ -70,9 +70,8 @@ module Wp2txt
70
70
  @@blank_line_regex = Regexp.new('^\s*$')
71
71
 
72
72
  @@redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
73
-
74
- category_patterns = ["Category", "Categoria"].join("|")
75
- @@category_regex = Regexp.new('[\{\[\|\b](?:' + category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
73
+
74
+ @@category_regex = Regexp.new('[\{\[\|\b](?:C|c)ategory\:(.*?)[\}\]\|\b]')
76
75
 
77
76
  def initialize(text, title = "", strip_tmarker = false)
78
77
  @title = title.strip
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.5.1"
2
+ VERSION = "0.5.02"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.5.02
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-01-16 00:00:00.000000000 Z
12
+ date: 2013-01-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec